***************************************************************
*** This program prepares the biomarker data for the SAS imputation step
*** 1. Identify the missing-data pattern of each biomarker
***    (number of waves with a non-missing value per respondent)
*** 2. Impute, by interpolation/extrapolation, the missing waves of
***    respondents with two or three observed waves; two specific
***    two-wave patterns are flagged (<bio>_twowaves_sas) and left to SAS
*** 3. Build the wide file bio_imp20tiles_wide.dta read by the SAS macro
***************************************************************
quietly include "../../../fem_env.do"

use "$dua_rand_hrs/bio_hrs_recoded_prepimp.dta" , clear

** Create indicators for the number of waves with a NON-missing biomarker value.
** egen tag() marks one observation per distinct non-missing hhidpn-wave-value
** combination, so the by-person total is the count of observed waves.
global biomarkers "hdl tchol a1c sysbp crp"
foreach var of global biomarkers {
    egen nomiss_`var' = tag(hhidpn wave `var')
    * bysort instead of by: does not require the input file to be pre-sorted
    bysort hhidpn: egen `var'_num_nomiss = total(nomiss_`var')
    tab `var'_num_nomiss, m
    drop nomiss_`var'
    gen `var'_threewaves = `var'_num_nomiss == 3
    gen `var'_twowaves   = `var'_num_nomiss == 2
    gen `var'_onewave    = `var'_num_nomiss == 1
    gen `var'_nowave     = `var'_num_nomiss == 0
    drop `var'_num_nomiss
}

** generate a variable with combination of cancre stroke diabe rxchol
save "$dua_rand_hrs/bio_hrs_recoded_prepimp_temp.dta" , replace

************************************************************
***** Impute each biomarker (hdl tchol a1c sysbp crp) in turn
************************************************************
** Process the cases with THREE or TWO observed waves of the biomarker.
** Each biomarker is paired with one grouping variable (local var).
foreach bio of global biomarkers {

    if "`bio'" == "hdl" {
        local var "hearte"
    }
    else if "`bio'" == "tchol" {
        local var "rxchol"
    }
    else if "`bio'" == "a1c" {
        local var "diabe"
    }
    else if "`bio'" == "sysbp" {
        local var "hibpe"
    }
    else if "`bio'" == "crp" {
        local var "cancre"
    }

    * clear added: do not depend on the data in memory being unchanged since the last save
    use "$dua_rand_hrs/bio_hrs_recoded_prepimp_temp.dta", clear
    keep if `bio'_threewaves == 1 | `bio'_twowaves == 1
    keep hhidpn wave pct_`bio' `var'
    reshape wide pct_`bio' `var', i(hhidpn) j(wave)

    egen `var'_miss = rowmiss(`var'8 `var'9 `var'10 `var'11 `var'12)
    tab `var'_miss

    ** Identify whether they are in the same `var' group across five waves.
    ** The chain below covers: no missing wave, exactly one missing wave,
    ** and exactly two missing waves of the grouping variable.
    gen `bio'_samegroup = 0
    replace `bio'_samegroup = 1 if (`var'8 == `var'9) & (`var'9 == `var'10) & (`var'10 == `var'11) & (`var'11 == `var'12)
    * one wave of the grouping variable missing
    replace `bio'_samegroup = 1 if (`var'8 == `var'9 & `var'9 == `var'10 & `var'10 == `var'11) & `var'12 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'9 & `var'9 == `var'10 & `var'10 == `var'12) & `var'11 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'9 & `var'9 == `var'11 & `var'11 == `var'12) & `var'10 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'10 & `var'10 == `var'11 & `var'11 == `var'12) & `var'9 == .
    replace `bio'_samegroup = 1 if (`var'9 == `var'10 & `var'10 == `var'11 & `var'11 == `var'12 ) & `var'8 == .
    * two waves of the grouping variable missing
    replace `bio'_samegroup = 1 if (`var'8 == `var'9 & `var'9 == `var'10) & `var'11 == . & `var'12 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'9 & `var'9 == `var'11) & `var'10 == . & `var'12 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'10 & `var'10 == `var'11) & `var'9 == . & `var'12 == .
    replace `bio'_samegroup = 1 if (`var'9 == `var'10 & `var'10 == `var'11) & `var'8 == . & `var'12 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'9 & `var'9 == `var'12) & `var'10 == . & `var'11 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'10 & `var'10 == `var'12) & `var'9 == . & `var'11 == .
    replace `bio'_samegroup = 1 if (`var'9 == `var'10 & `var'10 == `var'12) & `var'8 == . & `var'11 == .
    replace `bio'_samegroup = 1 if (`var'8 == `var'11 & `var'11 == `var'12) & `var'9 == . & `var'10 == .
    replace `bio'_samegroup = 1 if (`var'9 == `var'11 & `var'11 == `var'12) & `var'8 == . & `var'10 == .
    replace `bio'_samegroup = 1 if (`var'10 == `var'11 & `var'11 == `var'12) & `var'8 == . & `var'9 == .
    tab `bio'_samegroup , m

    ** Flag, per wave, the values that are missing BEFORE any imputation
    forvalues w = 8/12 {
        gen impflag_`bio'`w' = (pct_`bio'`w' == .)
    }

    ** Flag two missing-data patterns (observed only in waves 8 and 12, or only
    ** in waves 10 and 12). The same person-level flag is copied to every wave
    ** so that it survives the reshape back to long form.
    forvalues i=8/12 {
        gen `bio'_twowaves_sas`i' = 1 if impflag_`bio'8 != 1 & impflag_`bio'9 == 1 & impflag_`bio'10 == 1 & impflag_`bio'11 == 1 & impflag_`bio'12 != 1
        replace `bio'_twowaves_sas`i' = 1 if impflag_`bio'8 == 1 & impflag_`bio'9 == 1 & impflag_`bio'10 != 1 & impflag_`bio'11 == 1 & impflag_`bio'12 != 1
    }

    save temp, replace

    ** Interior waves (9, 10, 11). The order of the replace statements matters:
    ** later statements read values filled in by earlier ones.
    * same group throughout: midpoint of the two neighbouring waves
    replace pct_`bio'9 = (pct_`bio'8 + pct_`bio'10)/2 if pct_`bio'9 == . & (`bio'_samegroup == 1) & ( `var'8 == 0 & `var'9 == 0 & `var'10 == 0 )
    replace pct_`bio'10 = (pct_`bio'9 + pct_`bio'11)/2 if pct_`bio'10 == . & (`bio'_samegroup == 1)
    replace pct_`bio'11 = (pct_`bio'10 + pct_`bio'12)/2 if pct_`bio'11 == . & (`bio'_samegroup == 1)

    * group changes from 0 to 1 around the missing wave: copy the neighbour
    * that has the same group value as the missing wave
    replace pct_`bio'9 = pct_`bio'8 if pct_`bio'9 == . & `var'8 == 0 & `var'9 == 0 & `var'10 == 1
    replace pct_`bio'9 = pct_`bio'10 if pct_`bio'9 == . & `var'8 == 0 & `var'9 == 1 & `var'10 == 1
    replace pct_`bio'10 = pct_`bio'9 if pct_`bio'10 == . & `var'9 == 0 & `var'10 == 0 & `var'11 == 1
    replace pct_`bio'10 = pct_`bio'11 if pct_`bio'10 == . & `var'9 == 0 & `var'10 == 1 & `var'11 == 1
    replace pct_`bio'11 = pct_`bio'10 if pct_`bio'11 == . & `var'10 == 0 & `var'11 == 0 & `var'12 == 1
    replace pct_`bio'11 = pct_`bio'12 if pct_`bio'11 == . & `var'10 == 0 & `var'11 == 1 & `var'12 == 1

    * whatever is still missing: midpoint, or linear extrapolation for wave 11
    * when wave 12 is missing as well
    replace pct_`bio'9 = (pct_`bio'8 + pct_`bio'10)/2 if pct_`bio'9 == .
    replace pct_`bio'10 = (pct_`bio'9 + pct_`bio'11)/2 if pct_`bio'10 == .
    replace pct_`bio'11 = (pct_`bio'10 + pct_`bio'12)/2 if pct_`bio'11 == . & pct_`bio'12 != .
    replace pct_`bio'11 = (pct_`bio'10 - pct_`bio'9) + pct_`bio'10 if pct_`bio'11 == . & pct_`bio'12 == .
    *replace pct_`bio'9 = (pct_`bio'10 - pct_`bio'11) + pct_`bio'10 if pct_`bio'9 == . & pct_`bio'8 == .

    *** If <1 or >50 - assume the same value as previous wave?
    * NOTE(review): Stata treats missing as larger than any number, so a pct 11
    * that is still missing also satisfies the > 50 test here. The value copied
    * is wave 9, not wave 10 - confirm that this is intended.
    replace pct_`bio'11 = pct_`bio'9 if (pct_`bio'11 < 1 | pct_`bio'11 > 50 ) & pct_`bio'12 == .
    *replace pct_`bio'9 = pct_`bio'10 if (pct_`bio'9 < 1 | pct_`bio'9 > 50) & pct_`bio'8 == .

    *** Impute data in wave 8 and 12
    *** Linear extrapolation
    *** If <1 or >50 - assume the same value as previous wave?
    replace pct_`bio'8 = (pct_`bio'9 - pct_`bio'10) + pct_`bio'9 if pct_`bio'8 == .
    replace pct_`bio'12 = (pct_`bio'11 - pct_`bio'10) + pct_`bio'11 if pct_`bio'12 == .
    replace pct_`bio'8 = pct_`bio'9 if (pct_`bio'8 < 1 | pct_`bio'8 > 50 ) & pct_`bio'8 != .
    replace pct_`bio'12 = pct_`bio'11 if (pct_`bio'12 < 1 | pct_`bio'12 > 50) & pct_`bio'12 != .

    save temp1, replace

    ** Reshape from wide form to long form, in order to transfer to SAS file
    reshape long pct_`bio' `var' impflag_`bio' `bio'_twowaves_sas, i(hhidpn) j(wave)
    rename pct_`bio' pct_`bio'_imp
    save pct_`bio'_imp.dta, replace

    tabstat pct_`bio'_imp , by (`var') stat (n mean sd min max)
    tabstat pct_`bio'_imp if impflag_`bio' == 0, by (`var') stat (n mean sd min max)
    tabstat pct_`bio'_imp if impflag_`bio' == 1, by (`var') stat (n mean sd min max)
}

****************************************************************
*** Merge the five per-biomarker datasets into the master dataset
****************************************************************
use "$dua_rand_hrs/bio_hrs_recoded_prepimp_temp.dta" , clear
foreach bio of global biomarkers {
    merge 1:1 hhidpn wave using pct_`bio'_imp.dta, keep (master match) nogen
}

** Use the imputed value for three-wave cases and for two-wave cases
** that were NOT flagged for SAS
foreach bio of global biomarkers {
    gen `bio'_twowaves_nosas = 1 if `bio'_twowaves== 1 & `bio'_twowaves_sas != 1
    replace pct_`bio' = pct_`bio'_imp if `bio'_threewaves == 1 | `bio'_twowaves_nosas == 1
}

drop pct_hdl_imp pct_tchol_imp pct_a1c_imp pct_sysbp_imp pct_crp_imp
save "$dua_rand_hrs/bio_hrs_recoded_prepimp_v1.dta", replace

*** check the distribution
foreach bio of global biomarkers {
    foreach grp in hearte rxchol agecat race {
        tabstat pct_`bio' , by (`grp') stat (n mean sd min max)
    }
    foreach grp in hearte rxchol agecat race {
        tabstat pct_`bio' if impflag_`bio' == 0, by (`grp') stat (n mean sd min max)
    }
    sum pct_`bio' if impflag_`bio' == 0, d
    sum pct_`bio' , d
}

************************************************
** Create the Wide data for SAS imputation macro
************************************************
*** Things need to be done before this -
*** 1. Re-create the appropriate grouping variables and add it in the SAS sql step
*** 2. Create another predicted pct_biomarker value by hhidpn-unique, not unique by hhidpn-wave, as we will use it in the SAS steps
*** 3. Reshape the data to wide form

*use "/sch-projects/public-data-projects/FEMcurrent/weihanch/PCSK9.branch/input_data/bio_hrs_recoded_prepimp_v1.dta", clear
local rhs age black hispan male smkstat work widowed hsless college logbmi hearte stroke cancre lunge adlstat iadlstat rxchol hibperx diaberx fheart50 fdiabe50 fhibp50 fstrok50

tab wave
xtset hhidpn wave

** Fill each covariate: own value, else previous wave, else next wave,
** else the mean of that wave
foreach v of local rhs {
    gen `v'_m = missing(`v')
    egen `v'_mean = mean(`v'), by(wave)
    gen `v'_re = `v'
    replace `v'_re = l.`v' if missing(`v')
    replace `v'_re = f.`v' if missing(`v'_re)
    replace `v'_re = `v'_mean if missing(`v'_re)
    label var `v'_re "Recoded `v' by placing the lag-value for missing values"
    drop `v'_mean
}

** One record per person: person-level means of the filled covariates and biomarkers
collapse (mean) age_re black_re hispan_re male_re smkstat_re work_re widowed_re hsless_re college_re logbmi_re hearte_re stroke_re cancre_re lunge_re ///
    adlstat_re iadlstat_re rxchol_re hibperx_re diaberx_re ///
    fheart50_re fdiabe50_re fhibp50_re fstrok50_re ///
    hdl tchol a1c sysbp diabp pct_hdl pct_tchol pct_a1c pct_sysbp pct_crp , by (hhidpn)

/*
** place the population mean for missing values in RHS variables
local rhs_missing age black hispan hearte stroke hibperx diaberx cancre lunge hsless college work widowed smkstat logbmi rxchol fheart50 fdiabe50 fhibp50 fstrok50 ///
    adlstat iadlstat
foreach x of local rhs_missing {
    gen `x'_m = missing(`x')
    egen `x'_mean = mean(`x')
    gen `x'_re = `x'
    replace `x'_re = `x'_mean if `x' == .
    label var `x'_re "Recoded `x' by placing the population mean for missing values"
    drop `x'_mean
}
*/

** Person-level predicted percentile.
** After the collapse only the _re covariates exist, so they are named in full
** here; this keeps the regression working when variable abbreviation is off.
local xvars age_re male_re black_re hispan_re smkstat_re work_re widowed_re hsless_re college_re logbmi_re hearte_re stroke_re cancre_re lunge_re adlstat_re iadlstat_re ///
    rxchol_re hibperx_re diaberx_re fheart50_re fdiabe50_re fhibp50_re fstrok50_re
foreach bio of global biomarkers {
    reg pct_`bio' `xvars'
    predict pct_`bio'_hat
    sum pct_`bio'_hat
}

drop *_re
save temp2, replace

** Round the percentiles to integers in the long file
use "$dua_rand_hrs/bio_hrs_recoded_prepimp_v1.dta", clear
foreach bio of global biomarkers {
    replace pct_`bio' = round(pct_`bio', 1)
}
save, replace

** Per-biomarker variables carried into the wide file
local biovars
foreach bio of global biomarkers {
    local biovars `biovars' `bio' `bio'_nowave `bio'_onewave `bio'_twowaves `bio'_twowaves_sas `bio'_twowaves_nosas `bio'_threewaves
}

keep hhidpn wave iwstat `biovars' pct* hearte rxchol hibpe diabe cancre stroke
reshape wide iwstat `biovars' pct* hearte rxchol hibpe diabe cancre stroke , i(hhidpn) j(wave)

** Collapse the per-wave pattern flags back to one person-level flag each
foreach bio of global biomarkers {
    foreach flag in nowave onewave twowaves twowaves_sas twowaves_nosas threewaves {
        egen `bio'_`flag' = rowmax(`bio'_`flag'8 `bio'_`flag'9 `bio'_`flag'10 `bio'_`flag'11 `bio'_`flag'12)
        drop `bio'_`flag'8 `bio'_`flag'9 `bio'_`flag'1*
    }
}

merge 1:1 hhidpn using temp2.dta, keepusing (hhidpn pct_hdl_hat pct_tchol_hat pct_a1c_hat pct_sysbp_hat pct_crp_hat) keep (master match) nogen

** Wave number -> two-digit suffix (8->06, 9->08, 10->10, 11->12, 12->14).
** Wave 12 must be renamed before wave 11 so the new *12 names do not collide.
rename *8 *06
rename *9 *08
rename *10 *10
rename *12 *14
rename *11 *12

foreach bio of global biomarkers {
    egen pct_`bio'_sd = sd(pct_`bio'_hat)
}

** Column order: for each suffix, the four variables of every biomarker
** followed by the interview status and condition variables.
** NOTE(review): the per-wave pct_<bio>_hat and pct_<bio>_sd variables are not
** created in this script; presumably they come from the input file through the
** pct* wildcard above - confirm.
local ordered
foreach yr in 06 08 10 12 14 {
    foreach bio of global biomarkers {
        local ordered `ordered' `bio'`yr' pct_`bio'`yr' pct_`bio'_hat`yr' pct_`bio'_sd`yr'
    }
    local ordered `ordered' iwstat`yr' hibpe`yr' diabe`yr' hearte`yr' rxchol`yr' cancre`yr'
}
order hhidpn `ordered'

save "$dua_rand_hrs/bio_imp20tiles_wide.dta", replace

**** Final data file used by SAS - bio_imp20tiles_wide.dta
**** Now the variables we will need for the next step - take HDL for example
**** pct_hdl06 pct_hdl08 pct_hdl10 pct_hdl12 pct_hdl14
**** pct_hdl_hat pct_hdl_hat06 pct_hdl_hat08 pct_hdl_hat10 pct_hdl_hat12 pct_hdl_hat14
**** hdl_onewave hdl_twowaves hdl_twowaves_sas hdl_threewaves hdl_nowave
**** hearte06 hearte08 hearte10 hearte12 hearte14
**** hibpe06 hibpe08 hibpe10 hibpe12 hibpe14
**** diabe06 diabe08 diabe10 diabe12 diabe14
**** rxchol06 rxchol08 rxchol10 rxchol12 rxchol14

** Remove the intermediate files
foreach bio of global biomarkers {
    erase pct_`bio'_imp.dta
}
erase temp.dta
erase temp1.dta
erase temp2.dta
erase "$dua_rand_hrs/bio_hrs_recoded_prepimp_temp.dta"