**************************************************************
* Wendy - 7/8/2016 - prepare the biomarker data for the missing-data imputation
*
* This program creates 50-quantiles (within wave) for each biomarker and uses
* OLS models to generate predicted quantile values. The predicted quantile
* values are used to identify the nearest neighbors, which become the donors
* for the missing cases.
*
* Reads  : $dua_rand_hrs/bio_hrs_recoded.dta
*          (NOTE: this file is also re-saved in place after negative values
*          of a few variables are recoded to missing)
* Writes : $dua_rand_hrs/bio_hrs_recoded_prepimp.dta
**************************************************************

quietly include "../../../fem_env.do"

* Biomarkers that get a quantile (pct_*) and a predicted quantile (pct_*_hat).
* The order matters only for the order in which variables are created.
local biomarkers hdl tchol a1c sysbp crp

**************************************************************
** 1. Recode negative values to missing and write the file back in place
**************************************************************
use "$dua_rand_hrs/bio_hrs_recoded.dta", clear

foreach x in rxchol hibperx diaberx sysbp diabp {
    replace `x' = . if `x' < 0
}

save "$dua_rand_hrs/bio_hrs_recoded.dta", replace

**************************************************************
** 2. Create percentiles (50 quantiles) for biomarker values, wave by wave
**************************************************************
keep if wave >= 8
tab wave

forvalues x = 8/12 {
    tempfile wave`x'

    use "$dua_rand_hrs/bio_hrs_recoded.dta", clear
    keep if wave == `x'

    * Quantiles are computed within the wave, so pct_* is relative to the
    * distribution of that wave only.
    foreach b of local biomarkers {
        xtile pct_`b' = `b', nq(50)
    }

    tab wave
    foreach b of local biomarkers {
        tabstat pct_`b' `b', stat(n mean sd min max)
    }

    save "`wave`x''", replace
}

* Stack the per-wave files and keep only the keys plus the quantiles.
tempfile bio_pctile
use "`wave8'", clear
append using "`wave9'" "`wave10'" "`wave11'" "`wave12'"
keep hhidpn wave pct_hdl pct_tchol pct_a1c pct_sysbp pct_crp
save "`bio_pctile'", replace
tabstat pct_hdl, by(wave)

**************************************************************
** 3. Attach the quantiles to the full records for waves 8 and later
**************************************************************
* ", clear" added: the original relied on the data in memory being unchanged
* since the last save.
use "$dua_rand_hrs/bio_hrs_recoded.dta", clear
keep if wave >= 8
merge 1:1 hhidpn wave using "`bio_pctile'", keep(master match) nogen
tab wave

foreach b of local biomarkers {
    bys wave: sum pct_`b', d
}

keep hhidpn wave iwstat biowgtr wtresp age agecat raracem widowed educ black hispan male ///
    smkstat smoken smokev hatota hitot work hsless college obese overwt bmi ///
    hearte stroke hibpe diabe cancre lunge fheart50 fstrok50 fdiabe50 fhibp50 ///
    adlstat iadlstat rxchol hibperx diaberx hdl tchol a1c sysbp diabp crp ///
    pct_hdl pct_tchol pct_a1c pct_sysbp pct_crp

*xtset hhidpn wave
sort hhidpn wave

**************************************************************
** 4. Derived covariates
**************************************************************
* Wave indicators w8 ... w12 (w8 is the omitted category in the models below).
forvalues w = 8/12 {
    gen w`w' = (wave == `w')
}

gen logbmi = log(bmi)

* race: 1 = neither flag set, 2 = black, 3 = hispan (hispan overrides black).
* FIX: the default category is assigned only when both flags are observed.
* Previously "gen race = 1" silently put respondents with missing black/hispan
* into category 1, so the primary models below never handed them over to the
* fallback model that uses the imputed black_re / hispan_re.
gen race = 1 if !missing(black) & !missing(hispan)
replace race = 2 if black == 1
replace race = 3 if hispan == 1

gen agesq = age^2

* Linear spline of age with knots at 60, 70 and 80.
mkspline age_l60 60 age_6070 70 age_7080 80 age80p = age

** save "$dua_rand_hrs/bio_hrs_recoded_imp20tiles.dta", replace
**list hhidpn wave hdl pct_hdl tchol pct_tchol a1c pct_a1c sysbp pct_sysbp in 1/500, sepby(hhidpn wave)

* Weighted descriptives. NOTE(review): sysbp uses wtresp while the other
* biomarkers use biowgtr - kept as in the original; confirm this is intended.
tabstat hdl pct_hdl tchol pct_tchol [aw=biowgtr], by(wave)
tabstat a1c pct_a1c [aw=biowgtr], by(wave)
tabstat sysbp pct_sysbp [aw=wtresp], by(wave)

**************************************************************
** 5. Fill-in versions of the right-hand-side variables (fallback model only)
**************************************************************
set matsize 10000

local rhs age black hispan hearte stroke hibperx diaberx cancre lunge educ work widowed smoken smokev logbmi rxchol fheart50 fdiabe50 fhibp50 fstrok50 adlstat iadlstat

tab wave
xtset hhidpn wave

* For every RHS variable v:
*   v_m  = 1 if v is missing
*   v_re = v, else the previous-wave value, else the next-wave value,
*          else the mean of v within the wave
* (The unused L2v / F2v helper variables of the original were removed; they
* were generated and dropped without ever being read.)
foreach v of local rhs {
    gen `v'_m = missing(`v')
    egen `v'_mean = mean(`v'), by(wave)

    gen `v'_re = `v'
    replace `v'_re = l.`v' if missing(`v')
    replace `v'_re = f.`v' if missing(`v'_re)
    replace `v'_re = `v'_mean if missing(`v'_re)
    label var `v'_re "Recoded `v' by placing the lag-value for missing values"

    drop `v'_mean
}

* rhs_miss = 1 if any RHS variable is missing (kept in the output file).
gen rhs_miss = 0
foreach v of local rhs {
    replace rhs_miss = 1 if `v'_m == 1
}
tab rhs_miss

**************************************************************
** 6. Model the percentile prediction
**************************************************************
* Shared pieces of the primary specification.
local chronic i.hearte i.stroke i.hibperx i.diaberx i.cancre i.lunge
local age_by_chronic c.age_l60##(`chronic') c.age_6070##(`chronic') c.age_7080##(`chronic') c.age80p##(`chronic')
local famhx i.fheart50 i.fdiabe50 i.fhibp50
local wave_fe w9 w10 w11 w12

* Fallback specification: main effects of the filled-in (_re) covariates, used
* for observations the primary model cannot predict.
local fallback_rhs age_re male c.black_re##c.educ_re c.hispan_re##c.educ_re smoken_re smokev_re work_re widowed_re logbmi_re hearte_re stroke_re cancre_re lunge_re fheart50_re fstrok50_re fdiabe50_re fhibp50_re adlstat_re iadlstat_re rxchol_re hibperx_re diaberx_re `wave_fe'

foreach b of local biomarkers {

    * NOTE(review): the original specifications differ by biomarker and are
    * reproduced exactly: hdl and crp interact i.iadlstat with
    * male x race x rxchol, while tchol, a1c and sysbp do not (and list
    * c.logbmi before i.widowed). Confirm whether the difference is intended.
    if inlist("`b'", "hdl", "crp") {
        local demo i.educ i.work i.smoken i.smokev i.widowed c.logbmi
        local func i.adlstat i.iadlstat
    }
    else {
        local demo i.educ i.work i.smoken i.smokev c.logbmi i.widowed
        local func i.adlstat
    }

    * Primary model: fully interacted, complete cases only.
    reg pct_`b' `age_by_chronic' ///
        i.male##i.race##(`demo') ///
        i.male##i.race##i.rxchol##(`chronic' `famhx' `func') ///
        i.fstrok50 `wave_fe'
    predict pct_`b'_hat
    sum pct_`b'_hat
    sum pct_`b'
    corr pct_`b'_hat pct_`b'

    **** For those with missing in RHS variables
    * The fallback is estimated on all observations (the "if rhs_miss == 1"
    * restriction was commented out in the original) and its prediction is
    * used only where the primary prediction is missing.
    reg pct_`b' `fallback_rhs'
    predict pct_`b'_hat2
    sum pct_`b'_hat2
    replace pct_`b'_hat = pct_`b'_hat2 if missing(pct_`b'_hat)
    sum pct_`b'_hat
    count if missing(pct_`b'_hat)
    drop pct_`b'_hat2
}

**************************************************************
** 7. Generate the SD for predicted percentile and save
**************************************************************
* Overall (not by-wave) standard deviation of each predicted percentile,
* stored as a constant on every row.
foreach x of local biomarkers {
    egen pct_`x'_sd = sd(pct_`x'_hat)
}

* Drop the helper variables from step 5 (the *_re fill-ins and *_m flags).
drop *_re *_m

save "$dua_rand_hrs/bio_hrs_recoded_prepimp.dta", replace