quietly include ../../../fem_env.do


*** Assign actual value from the 50-quantiles

* Make datasets describing the biomarker distributions - do this by collapsing into 50 "equal"-sized groups

use "$dua_rand_hrs/bio_imp_all.dta" , clear 

keep hhidpn wave hdl pct_hdl tchol pct_tchol a1c pct_a1c sysbp pct_sysbp crp pct_crp

xtile hdl_grp = hdl, nq(50)
xtile tchol_grp = tchol, nq(50)
xtile a1c_grp = a1c, nq(50)
xtile sysbp_grp = sysbp, nq(50)
xtile crp_grp = crp, nq(50)

tabstat hdl, by(hdl_grp)
tabstat tchol, by(tchol_grp)
tabstat a1c, by(a1c_grp)
tabstat sysbp, by(sysbp_grp)
tabstat crp, by(crp_grp)

preserve

collapse (mean) hdl, by(hdl_grp)
rename hdl hdl_dist
gen n = _n
drop if hdl_grp == .
rename hdl_grp grp
list grp
tempfile hdl_grp
save `hdl_grp'

restore

preserve
collapse (mean) tchol, by(tchol_grp)
rename tchol tchol_dist
gen n = _n
drop if tchol_grp == .
rename tchol_grp grp
list grp
tempfile tchol_grp
save `tchol_grp'

restore

preserve 
collapse (mean) a1c, by(a1c_grp)
rename a1c a1c_dist
gen n = _n
drop if a1c_grp == .
rename a1c_grp  grp
list grp
tempfile a1c_grp
save `a1c_grp'
restore

preserve
collapse (mean) sysbp, by(sysbp_grp)
rename sysbp sysbp_dist
gen n = _n
drop if sysbp_grp == .
rename sysbp_grp grp
list grp
tempfile sysbp_grp
save `sysbp_grp'
restore

preserve
collapse (mean) crp, by(crp_grp)
rename crp crp_dist
gen n = _n
drop if crp_grp == .
rename crp_grp grp
list grp
tempfile crp_grp
save `crp_grp'


merge 1:1 grp using `hdl_grp', keepusing (grp hdl_dist) nogen
merge 1:1 grp using `tchol_grp' , keepusing (grp tchol_dist) nogen
merge 1:1 grp using `a1c_grp' , keepusing (grp a1c_dist) nogen
merge 1:1 grp using `sysbp_grp' , keepusing (grp sysbp_dist) nogen

save "$dua_rand_hrs/bio_grp.dta" , replace


	* Process the biomarker and reconstruct the distributions by interpolation
	use "$dua_rand_hrs/bio_grp.dta", replace
	* Move from the 50 categories to tenths of percentiles
		
	expand 20
	sort grp
	gen id = (_n-1)/10
	* Set values to be used for imputation at middle of each group
	replace hdl_dist = . if mod(id+1,2)>0
	replace tchol_dist = . if mod(id+1,2)>0
	replace a1c_dist = . if mod(id+1,2)>0
	replace sysbp_dist = . if mod(id+1,2)>0
	replace crp_dist = . if mod(id+1,2)>0
	
	* Set lowest values
	replace hdl_dist   = 12.11 if id == 0
	replace tchol_dist = 61.17 if id == 0
	replace a1c_dist   = 3 if id == 0
	replace sysbp_dist = 37.6 if id == 0
	replace crp_dist = 0.2 if id == 0
	
	* In case we need a 100th
	expand 2 if _n == _N
	replace id = 100 if _n == _N
	
	ipolate hdl_dist id, gen(hdl_imp) epolate
	ipolate tchol_dist id, gen(tchol_imp) epolate
	ipolate a1c_dist id, gen(a1c_imp) epolate
	ipolate sysbp_dist id, gen(sysbp_imp) epolate
	ipolate crp_dist id, gen(crp_imp) epolate
	
	gen hdl_tiles = id
	gen tchol_tiles = id
	gen a1c_tiles = id
	gen sysbp_tiles = id
	gen crp_tiles = id

	replace hdl_dist = hdl_imp
	replace tchol_dist = tchol_imp
	replace a1c_dist = a1c_imp
	replace sysbp_dist = sysbp_imp
	replace crp_dist = crp_imp
	
	keep grp hdl_tiles hdl_dist tchol_tiles tchol_dist a1c_tiles a1c_dist sysbp_tiles sysbp_dist crp_tiles crp_dist

	save "$dua_rand_hrs/bio_pct.dta", replace

	**********************************************
	* Convert these categories into continuous values, then assign to values
	* Get 50-55 2004 distribution and use this to convert categorical to continous (we have 1000 points in our distribution)
	**********************************************
	
	use "$dua_rand_hrs/bio_imp_all.dta", clear
	
	xtset hhidpn wave
	
		foreach var in hdl tchol a1c sysbp crp {
			
			gen l2pct_`var' = L.pct_`var'
			gen f2pct_`var' = F.pct_`var'

			
				replace pct_`var' = f2pct_`var' if pct_`var' == . & wave == 8
				replace pct_`var' = l2pct_`var' if pct_`var' == . & wave == 12
				

				** Fill the population mean value for the rest of missing obs
				egen mean_pct_`var' = mean(pct_`var'), by(wave)
				replace mean_pct_`var' = round(mean_pct_`var')
				replace pct_`var' = mean_pct_`var' if pct_`var' == . 
				
				
				drop l2pct_`var' f2pct_`var' mean_pct_`var'
				
				count if pct_`var' == .
}
	
	
	gen hdl_draw = runiform()
	gen tchol_draw = runiform()
	gen a1c_draw = runiform()
	gen sysbp_draw = runiform()
	gen crp_draw = runiform()
	
		
	gen hdl_tiles = round((100/50)*pct_hdl - (100/50)*hdl_draw,0.1)
  gen tchol_tiles = round((100/50)*pct_tchol - (100/50)*tchol_draw,0.1)
	gen a1c_tiles = round((100/50)*pct_a1c - (100/50)*a1c_draw,0.1)
	gen sysbp_tiles = round((100/50)*pct_sysbp - (100/50)*sysbp_draw,0.1)
	gen crp_tiles = round((100/50)*pct_crp - (100/50)*crp_draw,0.1)
	
	
	* Fill in the values
	merge m:1 hdl_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(hdl_dist) nogen
  merge m:1 tchol_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(tchol_dist) nogen
	merge m:1 a1c_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(a1c_dist) nogen
	merge m:1 sysbp_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(sysbp_dist) nogen
	merge m:1 crp_tiles using "$dua_rand_hrs/bio_pct.dta", keepusing(crp_dist) nogen
		
	sort hhidpn wave
	
	rename hdl_dist hdl_imp
	rename tchol_dist tchol_imp
	rename a1c_dist a1c_imp
	rename sysbp_dist sysbp_imp
	rename crp_dist crp_imp
	
  gen hdl_miss = (hdl == .)
	gen tchol_miss = (tchol == .)
	gen a1c_miss = (a1c == .)
	gen sysbp_miss = (sysbp == .)
	gen crp_miss = (crp == .)

  keep hhidpn wave pct_hdl hdl_tiles hdl_imp hdl_miss pct_tchol tchol_tiles tchol_imp tchol_miss pct_a1c a1c_tiles a1c_imp a1c_miss pct_sysbp sysbp_tiles sysbp_imp sysbp_miss pct_crp crp_tiles crp_imp crp_miss 
	
	merge 1:1 hhidpn wave using "$dua_rand_hrs/bio_hrs_recoded.dta", keep (master match) nogen
	
	replace hdl_imp = hdl if hdl_miss != 1
	replace tchol_imp = tchol if tchol_miss != 1
	replace a1c_imp = a1c if a1c_miss != 1
	replace sysbp_imp = sysbp if sysbp_miss != 1
	replace crp_imp = crp if crp_miss != 1
			
	** Test the distribution

	foreach var in hdl tchol a1c sysbp crp {
		sum `var' [aw=biowgtr], d
		sum `var'_imp [aw=wtresp], d
	} 
	

** Cap the values after the extrapolation

replace hdl_imp = 12 if hdl_imp < 12
replace hdl_imp = 146 if hdl_imp >= 146 & hdl_imp != .

replace tchol_imp = 61 if tchol_imp < 61
replace tchol_imp = 641 if tchol_imp >= 641 & tchol_imp != .

replace a1c_imp = 3 if a1c_imp < 3
replace a1c_imp = 17.6 if a1c_imp >= 17.6 & a1c_imp != .

replace sysbp_imp = 37 if sysbp_imp < 37
replace sysbp_imp = 227 if sysbp_imp >= 227 & sysbp_imp != .
 
	
save "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta", replace
	
	********************************************************
* Impute ldl data by NHANES, using imputed HDL and tchol
********************************************************

use "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta", clear

drop hdl tchol rxchol_cancre tchol_hdl tchol_hearte tchol_storke tchol_diabe tchol_hibpe tchol_cancre hdl_hearte hdl_storke hdl_diabe hdl_hibpe hdl_cancre

rename hdl_imp hdl
rename tchol_imp tchol

count if hdl != .
count if tchol != .

gen tchol_hdl  = tchol*hdl
gen tchol_hearte = tchol*hearte
gen tchol_storke = tchol*stroke
gen tchol_diabe = tchol*diabe
gen tchol_hibpe = tchol*hibpe
gen tchol_cancre = tchol*cancre

gen hdl_hearte = hdl*hearte
gen hdl_storke = hdl*stroke
gen hdl_diabe = hdl*diabe
gen hdl_hibpe = hdl*hibpe
gen hdl_cancre = hdl*cancre

gen rxchol_stroke = rxchol*stroke
gen rxchol_hearte = rxchol*hearte
gen rxchol_diabe  = rxchol*diabe
gen rxchol_cancre = rxchol*cancre


est use "$local_path/Estimates/nhanes_biomarkers/nhanes_ldl.ster"

predict ldl_imp, xb

sum ldl_imp [aw=wtresp], detail

est use "$local_path/Estimates/nhanes_biomarkers/nhanes_ldl_minimal.ster"

predict ldl_min, xb

replace ldl_imp = ldl_min if ldl_imp == .

replace ldl_imp = 10.06 if ldl_imp <= 10

keep hhidpn wave ldl_imp

tempfile ldl_imp
save `ldl_imp'

	
use "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta"

merge 1:1 hhidpn wave using `ldl_imp', keep (master match) nogen

save, replace

***************************************************************************************************
* Available imputed biomarker - ldl hdl tchol a1c sysbp crp
* Generate the biomarker_re as placing the population mean for the missing biomarkers
* Generate missing flag variables
* Generate l4 for all the variables
***************************************************************************************************


* Make a panel
xtset hhidpn wave


gen logcrp_imp = log(crp_imp)

* Gen 2 year lagged variable for imputed hdl tchol ldl a1c sysbp
gen l2ldl_imp = L.ldl_imp
gen l2hdl_imp = L.hdl_imp
gen l2tchol_imp = L.tchol_imp
gen l2a1c_imp = L.a1c_imp
gen l2sysbp_imp = L.sysbp_imp
gen l2crp_imp = L.crp_imp
gen l2logcrp_imp = log(l2crp_imp)

gen l2a1c = L.a1c
gen l2hdl = L.hdl
gen l2tchol = L.tchol
gen l2sysbp = L.sysbp
gen l2ldl = L.ldl
gen l2crp = L.crp
gen l2logcrp = log(l2crp)

gen logcrp = log(crp)
*gen l4logcrp = log(l4crp)

drop hdl_miss tchol_miss a1c_miss sysbp_miss crp_miss

** Create biomarker_re, biomarker_miss
local bio hdl ldl tchol a1c sysbp crp

foreach v of local bio {
gen `v'_miss = (`v' == .)

egen `v'_mean = mean(`v'), by(wave)
gen `v'_re = `v'

replace `v'_re = `v'_mean if `v' == .
label var `v'_re "Recoded `v' by placing the mean for missing values (for model-validation purpose)"
label var `v'_miss "Whether `v' value is missing (for model-validation purpose)"
drop `v'_mean

gen l2`v'_miss = (l2`v'== .)
label var l2`v'_miss "Whether l2`v' value is missing (used in the transition model)"


}

*drop logcrp_miss l4logcrp_miss

* Label the biomarker variables

foreach bio of local biomarkers {
	
	label var pct_`bio' "Imputed percentile for `bio'"
	label var `bio'_imp "Imputed `bio' value "
	label var l2`bio' "2 year lag of NHANES adjusted `bio'"
	label var l2`bio'_imp "2 year lag of Imputed `bio' value"
		
}

 
drop *_tiles tchol_hdl tchol_hearte tchol_storke tchol_diabe tchol_hibpe tchol_cancre hdl_hearte hdl_storke hdl_diabe hdl_hibpe hdl_cancre rxchol_cancre

compress
save "$dua_rand_hrs/bio_hrs_recoded_impfinal.dta", replace

** Delete the unnecessary interim data files

erase "$dua_rand_hrs/bio_pct.dta"
erase "$dua_rand_hrs/bio_grp.dta" 
erase "$dua_rand_hrs/bio_hrs_recoded.dta"
erase "$dua_rand_hrs/bio_hrs_recoded_prepimp.dta"
erase "$dua_rand_hrs/bio_hrs_recoded_prepimp_v1.dta"
erase "$dua_rand_hrs/bio_imp20tiles_wide.dta"
erase "$dua_rand_hrs/bio_imp_all.sas7bdat"
erase "$dua_rand_hrs/bio_imp_finalcrp.sas7bdat"
erase "$dua_rand_hrs/bio_imp_finalsysbp.sas7bdat"
erase "$dua_rand_hrs/bio_imp_finala1c.sas7bdat"
erase "$dua_rand_hrs/bio_imp_finaltchol.sas7bdat"
erase "$dua_rand_hrs/bio_imp_finalhdl.sas7bdat"
erase "$dua_rand_hrs/imputedcrp.sas7bdat"
erase "$dua_rand_hrs/imputedsysbp.sas7bdat"
erase "$dua_rand_hrs/imputeda1c.sas7bdat"
erase "$dua_rand_hrs/imputedtchol.sas7bdat"
erase "$dua_rand_hrs/imputedhdl.sas7bdat"
erase "$dua_rand_hrs/bio_hrs_recoded_prepimp_v1.sas7bdat"
erase "$dua_rand_hrs/bio_imp20tiles_wide.sas7bdat"


*********************************************************************************
***  Descriptive stat
*********************************************************************************

recode age (51/64=1) (65/80=2) (nonmiss=4), gen(agecat3)
replace agecat3 = 3 if age > 80
recode agecat3 (4 = .)

gen hearterx = 0 if hearte == 1 & rxchol == 0
replace hearterx = 1 if hearte == 1 & rxchol == 1
replace hearterx = 2 if hearte == 0

tab hearterx, m

local bioimp hdl tchol a1c sysbp ldl crp logcrp

foreach x of local bioimp {

tabstat `x'   [aw=biowgtr] if wave >=8 , by (agecat3) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (agecat3) stat (n mean median sd) long format

tabstat `x'   [aw=biowgtr] if wave >=8 , by (raracem) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (raracem) stat (n mean median sd) long format

tabstat `x'  [aw=biowgtr] if wave >=8 , by (male) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (male) stat (n mean median sd) long format

tabstat `x'   [aw=biowgtr] if wave >=8 , by (educ) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (educ) stat (n mean median sd) long format

tabstat `x'   [aw=biowgtr] if wave >=8 , by (smkstat) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (smkstat) stat (n mean median sd) long format

tabstat `x'  [aw=biowgtr] if wave >=8 , by (wtstate) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (wtstate) stat (n mean median sd) long format

tabstat `x'   [aw=biowgtr] if wave >=8 , by (hearterx) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (hearterx) stat (n mean median sd) long format

tabstat `x'   [aw=biowgtr] if wave >=8 , by (diaberx) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (diaberx) stat (n mean median sd) long format

tabstat `x'   [aw=biowgtr] if wave >=8 , by (hibperx) stat (n mean median sd) long format
tabstat `x' `x'_imp  [aw=wtresp] if wave >=8 , by (hibperx) stat (n mean median sd) long format

}