quietly include ../../../fem_env.do *** Assign actual value from the 50-quantiles * Make datasets describing the biomarker distributions - do this by collapsing into 50 "equal"-sized groups use "$dua_rand_hrs/bio_imp_all.dta" , clear keep hhidpn wave hdl pct_hdl tchol pct_tchol a1c pct_a1c sysbp pct_sysbp crp pct_crp xtile hdl_grp = hdl, nq(50) xtile tchol_grp = tchol, nq(50) xtile a1c_grp = a1c, nq(50) xtile sysbp_grp = sysbp, nq(50) xtile crp_grp = crp, nq(50) tabstat hdl, by(hdl_grp) tabstat tchol, by(tchol_grp) tabstat a1c, by(a1c_grp) tabstat sysbp, by(sysbp_grp) tabstat crp, by(crp_grp) preserve collapse (mean) hdl, by(hdl_grp) rename hdl hdl_dist gen n = _n drop if hdl_grp == . rename hdl_grp grp list grp tempfile hdl_grp save `hdl_grp' restore preserve collapse (mean) tchol, by(tchol_grp) rename tchol tchol_dist gen n = _n drop if tchol_grp == . rename tchol_grp grp list grp tempfile tchol_grp save `tchol_grp' restore preserve collapse (mean) a1c, by(a1c_grp) rename a1c a1c_dist gen n = _n drop if a1c_grp == . rename a1c_grp grp list grp tempfile a1c_grp save `a1c_grp' restore preserve collapse (mean) sysbp, by(sysbp_grp) rename sysbp sysbp_dist gen n = _n drop if sysbp_grp == . rename sysbp_grp grp list grp tempfile sysbp_grp save `sysbp_grp' restore preserve collapse (mean) crp, by(crp_grp) rename crp crp_dist gen n = _n drop if crp_grp == . rename crp_grp grp list grp tempfile crp_grp save `crp_grp' merge 1:1 grp using `hdl_grp', keepusing (grp hdl_dist) nogen merge 1:1 grp using `tchol_grp' , keepusing (grp tchol_dist) nogen merge 1:1 grp using `a1c_grp' , keepusing (grp a1c_dist) nogen merge 1:1 grp using `sysbp_grp' , keepusing (grp sysbp_dist) nogen save "$dua_rand_hrs/bio_grp.dta" , replace * Process the biomarker and reconstruct the distributions by interpolation use "$dua_rand_hrs/bio_grp.dta", replace * Move from the 50 categories to tenths of percentiles expand 20 sort grp gen id = (_n-1)/10 * Set values to be used for imputation at middle of each group replace hdl_dist = . if mod(id+1,2)>0 replace tchol_dist = . if mod(id+1,2)>0 replace a1c_dist = . if mod(id+1,2)>0 replace sysbp_dist = . if mod(id+1,2)>0 replace crp_dist = . if mod(id+1,2)>0 * Set lowest values replace hdl_dist = 12.11 if id == 0 replace tchol_dist = 61.17 if id == 0 replace a1c_dist = 3 if id == 0 replace sysbp_dist = 37.6 if id == 0 replace crp_dist = 0.2 if id == 0 * In case we need a 100th expand 2 if _n == _N replace id = 100 if _n == _N ipolate hdl_dist id, gen(hdl_imp) epolate ipolate tchol_dist id, gen(tchol_imp) epolate ipolate a1c_dist id, gen(a1c_imp) epolate ipolate sysbp_dist id, gen(sysbp_imp) epolate ipolate crp_dist id, gen(crp_imp) epolate gen hdl_tiles = id gen tchol_tiles = id gen a1c_tiles = id gen sysbp_tiles = id gen crp_tiles = id replace hdl_dist = hdl_imp replace tchol_dist = tchol_imp replace a1c_dist = a1c_imp replace sysbp_dist = sysbp_imp replace crp_dist = crp_imp keep grp hdl_tiles hdl_dist tchol_tiles tchol_dist a1c_tiles a1c_dist sysbp_tiles sysbp_dist crp_tiles crp_dist save "$dua_rand_hrs/bio_pct.dta", replace ********************************************** * Convert these categories into continuous values, then assign to values * Get 50-55 2004 distribution and use this to convert categorical to continous (we have 1000 points in our distribution) ********************************************** use "$dua_rand_hrs/bio_imp_all.dta", clear xtset hhidpn wave foreach var in hdl tchol a1c sysbp crp { gen l2pct_`var' = L.pct_`var' gen f2pct_`var' = F.pct_`var' replace pct_`var' = f2pct_`var' if pct_`var' == . & wave == 8 replace pct_`var' = l2pct_`var' if pct_`var' == . & wave == 12 ** Fill the population mean value for the rest of missing obs egen mean_pct_`var' = mean(pct_`var'), by(wave) replace mean_pct_`var' = round(mean_pct_`var') replace pct_`var' = mean_pct_`var' if pct_`var' == . drop l2pct_`var' f2pct_`var' mean_pct_`var' count if pct_`var' == . } gen hdl_draw = runiform() gen tchol_draw = runiform() gen a1c_draw = runiform() gen sysbp_draw = runiform() gen crp_draw = runiform() gen hdl_tiles = round((100/50)*pct_hdl - (100/50)*hdl_draw,0.1) gen tchol_tiles = round((100/50)*pct_tchol - (100/50)*tchol_draw,0.1) gen a1c_tiles = round((100/50)*pct_a1c - (100/50)*a1c_draw,0.1) gen sysbp_tiles = round((100/50)*pct_sysbp - (100/50)*sysbp_draw,0.1) gen crp_tiles = round((100/50)*pct_crp - (100/50)*crp_draw,0.1) * Fill in the values merge m:1 hdl_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(hdl_dist) nogen merge m:1 tchol_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(tchol_dist) nogen merge m:1 a1c_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(a1c_dist) nogen merge m:1 sysbp_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(sysbp_dist) nogen merge m:1 crp_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(crp_dist) nogen sort hhidpn wave rename hdl_dist hdl_imp rename tchol_dist tchol_imp rename a1c_dist a1c_imp rename sysbp_dist sysbp_imp rename crp_dist crp_imp gen hdl_miss = (hdl == .) gen tchol_miss = (tchol == .) gen a1c_miss = (a1c == .) gen sysbp_miss = (sysbp == .) gen crp_miss = (crp == .) keep hhidpn wave pct_hdl hdl_tiles hdl_imp hdl_miss pct_tchol tchol_tiles tchol_imp tchol_miss pct_a1c a1c_tiles a1c_imp a1c_miss pct_sysbp sysbp_tiles sysbp_imp sysbp_miss pct_crp crp_tiles crp_imp crp_miss merge 1:1 hhidpn wave using "$dua_rand_hrs/bio_hrs_recoded.dta", keep (master match) nogen replace hdl_imp = hdl if hdl_miss != 1 replace tchol_imp = tchol if tchol_miss != 1 replace a1c_imp = a1c if a1c_miss != 1 replace sysbp_imp = sysbp if sysbp_miss != 1 replace crp_imp = crp if crp_miss != 1 ** Test the distribution foreach var in hdl tchol a1c sysbp crp { sum `var' [aw=biowgtr], d sum `var'_imp [aw=wtresp], d } ** Cap the values after the extrapolation replace hdl_imp = 12 if hdl_imp < 12 replace hdl_imp = 146 if hdl_imp >= 146 & hdl_imp != . replace tchol_imp = 61 if tchol_imp < 61 replace tchol_imp = 641 if tchol_imp >= 641 & tchol_imp != . replace a1c_imp = 3 if a1c_imp < 3 replace a1c_imp = 17.6 if a1c_imp >= 17.6 & a1c_imp != . replace sysbp_imp = 37 if sysbp_imp < 37 replace sysbp_imp = 227 if sysbp_imp >= 227 & sysbp_imp != . save "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta", replace ******************************************************** * Impute ldl data by NHANES, using imputed HDL and tchol ******************************************************** use "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta", clear drop hdl tchol rxchol_cancre tchol_hdl tchol_hearte tchol_storke tchol_diabe tchol_hibpe tchol_cancre hdl_hearte hdl_storke hdl_diabe hdl_hibpe hdl_cancre rename hdl_imp hdl rename tchol_imp tchol count if hdl != . count if tchol != . gen tchol_hdl = tchol*hdl gen tchol_hearte = tchol*hearte gen tchol_storke = tchol*stroke gen tchol_diabe = tchol*diabe gen tchol_hibpe = tchol*hibpe gen tchol_cancre = tchol*cancre gen hdl_hearte = hdl*hearte gen hdl_storke = hdl*stroke gen hdl_diabe = hdl*diabe gen hdl_hibpe = hdl*hibpe gen hdl_cancre = hdl*cancre gen rxchol_stroke = rxchol*stroke gen rxchol_hearte = rxchol*hearte gen rxchol_diabe = rxchol*diabe gen rxchol_cancre = rxchol*cancre est use "$local_path/Estimates/nhanes_biomarkers/nhanes_ldl.ster" predict ldl_imp, xb sum ldl_imp [aw=wtresp], detail est use "$local_path/Estimates/nhanes_biomarkers/nhanes_ldl_minimal.ster" predict ldl_min, xb replace ldl_imp = ldl_min if ldl_imp == . replace ldl_imp = 10.06 if ldl_imp <= 10 keep hhidpn wave ldl_imp tempfile ldl_imp save `ldl_imp' use "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta" merge 1:1 hhidpn wave using `ldl_imp', keep (master match) nogen save, replace *************************************************************************************************** * Available imputed biomarker - ldl hdl tchol a1c sysbp crp * Generate the biomarker_re as placing the population mean for the missing biomarkers * Generate missing flag variables * Generate l4 for all the variables *************************************************************************************************** * Make a panel xtset hhidpn wave gen logcrp_imp = log(crp_imp) * Gen 2 year lagged variable for imputed hdl tchol ldl a1c sysbp gen l2ldl_imp = L.ldl_imp gen l2hdl_imp = L.hdl_imp gen l2tchol_imp = L.tchol_imp gen l2a1c_imp = L.a1c_imp gen l2sysbp_imp = L.sysbp_imp gen l2crp_imp = L.crp_imp gen l2logcrp_imp = log(l2crp_imp) gen l2a1c = L.a1c gen l2hdl = L.hdl gen l2tchol = L.tchol gen l2sysbp = L.sysbp gen l2ldl = L.ldl gen l2crp = L.crp gen l2logcrp = log(l2crp) gen logcrp = log(crp) *gen l4logcrp = log(l4crp) drop hdl_miss tchol_miss a1c_miss sysbp_miss crp_miss ** Create biomarker_re, biomarker_miss local bio hdl ldl tchol a1c sysbp crp foreach v of local bio { gen `v'_miss = (`v' == .) egen `v'_mean = mean(`v'), by(wave) gen `v'_re = `v' replace `v'_re = `v'_mean if `v' == . label var `v'_re "Recoded `v' by placing the mean for missing values (for model-validation purpose)" label var `v'_miss "Whether `v' value is missing (for model-validation purpose)" drop `v'_mean gen l2`v'_miss = (l2`v'== .) label var l2`v'_miss "Whether l2`v' value is missing (used in the transition model)" } *drop logcrp_miss l4logcrp_miss * Label the biomarker variables foreach bio of local biomarkers { label var pct_`bio' "Imputed percentile for `bio'" label var `bio'_imp "Imputed `bio' value " label var l2`bio' "2 year lag of NHANES adjusted `bio'" label var l2`bio'_imp "2 year lag of Imputed `bio' value" } drop *_tiles tchol_hdl tchol_hearte tchol_storke tchol_diabe tchol_hibpe tchol_cancre hdl_hearte hdl_storke hdl_diabe hdl_hibpe hdl_cancre rxchol_cancre compress save "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta", replace ** Delete the unnecessary interim data files erase "$dua_rand_hrs/bio_pct.dta" erase "$dua_rand_hrs/bio_grp.dta" erase "$dua_rand_hrs/bio_hrs_recoded.dta" erase "$dua_rand_hrs/bio_hrs_recoded_prepimp.dta" erase "$dua_rand_hrs/bio_hrs_recoded_prepimp_v1.dta" erase "$dua_rand_hrs/bio_imp20tiles_wide.dta" erase "$dua_rand_hrs/bio_imp_all.sas7bdat" erase "$dua_rand_hrs/bio_imp_finalcrp.sas7bdat" erase "$dua_rand_hrs/bio_imp_finalsysbp.sas7bdat" erase "$dua_rand_hrs/bio_imp_finala1c.sas7bdat" erase "$dua_rand_hrs/bio_imp_finaltchol.sas7bdat" erase "$dua_rand_hrs/bio_imp_finalhdl.sas7bdat" erase "$dua_rand_hrs/imputedcrp.sas7bdat" erase "$dua_rand_hrs/imputedsysbp.sas7bdat" erase "$dua_rand_hrs/imputeda1c.sas7bdat" erase "$dua_rand_hrs/imputedtchol.sas7bdat" erase "$dua_rand_hrs/imputedhdl.sas7bdat" erase "$dua_rand_hrs/bio_hrs_recoded_prepimp_v1.sas7bdat" erase "$dua_rand_hrs/bio_imp20tiles_wide.sas7bdat" ********************************************************************************* *** Descriptive stat ********************************************************************************* recode age (51/64=1) (65/80=2) (nonmiss=4), gen(agecat3) replace agecat3 = 3 if age > 80 recode agecat3 (4 = .) gen hearterx = 0 if hearte == 1 & rxchol == 0 replace hearterx = 1 if hearte == 1 & rxchol == 1 replace hearterx = 2 if hearte == 0 tab hearterx, m local bioimp hdl tchol a1c sysbp ldl crp logcrp foreach x of local bioimp { tabstat `x' [aw=biowgtr] if wave >=8 , by (agecat3) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (agecat3) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (raracem) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (raracem) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (male) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (male) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (educ) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (educ) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (smkstat) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (smkstat) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (wtstate) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (wtstate) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (hearterx) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (hearterx) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (diaberx) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (diaberx) stat (n mean median sd) long format tabstat `x' [aw=biowgtr] if wave >=8 , by (hibperx) stat (n mean median sd) long format tabstat `x' `x'_imp [aw=wtresp] if wave >=8 , by (hibperx) stat (n mean median sd) long format }