/*
The goal in this file is to estimate the inverse hyperbolic sine models using ghreg.  Then, we will use the residuals terms from those 
models in the cmp procedure.  After the CMP estimation, the models from ghreg will replace the residual models used in cmp.  

This will produce files for use in new51simulate: 
	incoming_vcmatrix.dta - the variance-covariance matrix
	incoming_means.dta - the beta coefficients for the models
	incoming_means_econ_tos.dta - the parameters for the inverse hyperbolic sine models
	incoming_cut_points.dta - the cut points for the ordered probit models

9/28/15 - removed AIME and quarters worked.  These are estimated in a separate joint estimation.

*/


quietly include common.do

* Install the cmp mata routine
quietly do "$local_path/utilities/cmp.mata"

* Include this since we need ghreg
adopath + "$local_path/hyp_mata"
adopath ++ "$local_path/utilities"

* use restricted data for 1992 50-55 cohort - (We no longer need this restricted data restriction ...)
* use $dua_rand_hrs/age5055_hrs1992r.dta
use $outdata/age5055_hrs1992.dta

* Set seed for repeatable results
 set seed 8675309


* Right-hand side variables for all models
local rhs black hispan hsless college male fsingle fwidowed flunge fcancre fstroke
* Shorter specification removing stroke for use in some models since it is so rare in the 5055 population
local rhs2 black hispan hsless college male fsingle fwidowed flunge fcancre

* Estimate the IHS models using ghreg

local ihs_var "hatotax iearnx dcwlthx"
local restrict "fwlth_nonzero work anydc"
local n : word count `ihs_var'


* An ordered list of the models estimated - not used in the estimation, but used in the saving of the parameters
#delimit ;
local models 		fhibpe 				fhearte 	fdiabe 		fanyhi 			fshlt 
								fwtstate 			fsmkstat 	fanyadl		fanyiadl		fwork
 								fwlth_nonzero 										hatotax 		iearnx
 								dcwlthx 			fanydc 		fanydb 		frdb_ea_c 	frdb_na_c
								;
#delimit cr
local dim : word count `models'

* matrix for storing theta, omega, and SSR parameters
mat TOS = J(`n',3,.)

forvalues i = 1/`n' {
		local a : word `i' of `ihs_var'
		local b : word `i' of `restrict'	
		ghreg `a' black hispan hsless college male single widowed lunge cancre if `b' == 1
		
		* Store the estimates for theta, omega, and ssr for later in new51_simulate
		mat TOS[`i',1] = e(theta)
		mat TOS[`i',2] = e(omega)
		mat TOS[`i',3] = e(ssr)
		
		* Store the models so we can replace the cmp estimated models
		mat ghreg_`a' = e(b)
		mat rownames ghreg_`a' = `a' 
		mat list ghreg_`a'
		
		* Store  the SSR for later use
		local ssr_`a' = e(ssr)
		di "SSR = " `ssr_`a''
		
		* These are the predicted values g = xB (no error)
		predict p1_`a'
		sum p1_`a'
		
		* These are the simulated values g = xB + error (including a draw from a uniform distribution)
		predict p2_`a', simu
		sum p2_`a'

		* Store the parameters
		tempvar theta 
		tempvar omega
		gen `theta' = e(theta)
		gen `omega' = e(omega)
		di "Theta = " `theta' 
		di "Omega = " `omega'
		 
		* Variables/parameters for the transformation 
		gen term_`a' = `theta'*(`a'+`omega')
		tempvar term2
		gen `term2' = `theta'*`omega'
		
		* Generate the transformed outcome
		gen g_`a' = (ln(term_`a'+(1+(term_`a')^2)^0.5) - ln(`term2'+(1+(`term2')^2)^0.5)) / (`theta'*((1+(`term2')^2)^(-1/2)))
		sum g_`a'
		
		* Summarize the predicted value
		sum p1_`a'
		
		* Calculate the residuals
		gen res_`a' = g_`a' - p1_`a'
		sum res_`a'
		
		* This is a parameter used in the simulated prediction, needs to be dropped for subsequent estimations
		drop t
		
}

mat rownames TOS = `ihs_var'
mat colnames TOS = theta omega ssr
mat incoming_means_econ_tos = TOS		
matsave incoming_means_econ_tos, replace path("$indata") saving


* The approach for the ghreg models is to use the calculated residuals as the outcomes of a very simple OLS estimation:  
* 		res_hatotax = error 
* 		res_iearnx = error 
* 		res_dcwlthx = error 


* This is for the res_hatotax, res_iearnx, and res_dcwlthx cmp estimation where we only fit the error term.
gen c = 0

***************************************************************************************************************
/*
* CMP  -  
		options include:
		- tech = search algorithm (here, using dfp), 
		- nrtolerance = tolerance of steps (default is 10^-5, larger values give faster convergence (useful for testing))
		- ghkdraws = since this is a Monte Carlo simulation, we specify # of draws.  Ideally, ~100, smaller values run faster (useful for testing)
*/
*****************************************************************************************************************
cmp setup
which cmp

* Constraints
* work and earnings
constraint 1 [atanhrho_1013]_cons = 0
* work and anydc 
constraint 2 [atanhrho_1015]_cons = 0
* work and anydb
constraint 3 [atanhrho_1016]_cons = 0
* non-zero wealth and wealth
constraint 4 [atanhrho_1112]_cons = 0
* anydb and early db
constraint 5 [atanhrho_1617]_cons = 0
* anydb and normal db
constraint 6 [atanhrho_1618]_cons = 0
* fanydc and flogdcwlthx
constraint 7 [atanhrho_1415]_cons = 0
* work and flogdcwlthx?
constraint 8 [atanhrho_1014]_cons = 0
* work and frdb_ea_c?
constraint 9 [atanhrho_1017]_cons = 0
* work and frdb_na_c?
constraint 10 [atanhrho_1018]_cons = 0

* Additional constraints on the variance terms for ihs models
                                            

#delimit ;

cmp 				(fhibpe = `rhs2') 				(fhearte = `rhs2') 		(fdiabe = `rhs2') 			(fanyhi = `rhs2') 			(fshlt = `rhs2') 
 						(fwtstate = `rhs2') 			(fsmkstat = `rhs2') 	(fanyadl = `rhs2') 			(fanyiadl = `rhs2')			(fwork = `rhs2')
			 			(fwlth_nonzero = `rhs2')	  																						(res_hatotax = c) 			(res_iearnx = c)
						(res_dcwlthx = c)					(fanydc = `rhs2')			(fanydb = `rhs2') 			(frdb_ea_c = `rhs2') 		(frdb_na_c = `rhs2')
, indicators(
						$cmp_probit 				$cmp_probit 				$cmp_probit 						$cmp_probit 						$cmp_probit 
 					 	$cmp_oprobit 				$cmp_oprobit 				$cmp_probit 						$cmp_probit							$cmp_probit
 						$cmp_probit 																										fwlth_nonzero*$cmp_cont fwork*$cmp_cont
 						fanydc*$cmp_cont		fwork*$cmp_probit 	fwork*$cmp_probit 			fanydb*$cmp_oprobit 		fanydb*$cmp_oprobit
 						)   
 	 constraint(1/10) 
 	 tech(dfp) 
 	 nrtolerance(.01)
 	 ghkdraws(5)
 	 
;
 
#delimit cr


**************************************************************************************************** 
* Format and save the output
*******************************************************************************************************

* Build the Variance-Covariance matrix
mat VC = J(`dim',`dim',.)
* label the rows and colums
mat rownames VC = `models'
mat colnames VC = `models'

* Populate the variance terms - All are 1 except for continuous models.
forvalues x = 1/`dim' {
	capture: mat VC[`x',`x'] = (exp(_b[lnsig_`x':_cons]))^2
	if _rc > 0 { 
		mat VC[`x',`x'] = 1
	}
}

* Replace the variance terms for the ghreg estimations with the SSR terms
forvalues i = 1/`n' {
	local a : word `i' of `ihs_var'
	local r = rownumb(matrix(VC),"`a'")
	mat VC[`r',`r'] = `ssr_`a''
}

* Populate the covariance terms
local oneless = `dim' - 1
forvalues x = 1/`oneless' {
		local a = `x' + 1
		forvalues y = `a'/`dim' {
			mat VC[`y',`x'] = tanh(_b[atanhrho_`x'`y':_cons])*sqrt(VC[`x',`x']*VC[`y',`y']) 
	}
}


* Save the VC matrix
mat incoming_vcmatrix = VC
matsave incoming_vcmatrix, replace path("$indata") saving


* Store the beta coefficients from the models (equivalent to old incoming_means.dta)

* Note: ordered probit models are estimated without a constant.  The cut points can be transformed by the 
* value of cut point 1 to be comparable to the model that included a constant.

local cov black hispan hsless college male fsingle fwidowed flunge fcancre _cons

local r_count : word count `models'
local c_count : word count `cov'

mat IM = J(`r_count',`c_count',.)

local a = 1

foreach x in `models' {
	local b = 1
	foreach y in `cov' {
		capture:  mat IM[`a',`b'] = _b[`x':`y']
		* This is to deal with no constant in the ordered probit estimations
		if _rc != 0 {
				mat IM[`a',`b'] = 0
			}
		local b = `b' + 1
		}
	local a = `a' + 1
}		

mat list IM

* Set the constant to the first cut point for the ordered probit models
foreach x in 6 7 17 18 {
	mat IM[`x',10] = -_b[cut_`x'_1:_cons]
}

mat list IM

mat rownames IM = `models' 
mat colnames IM = black hispan hsless college male single widowed lunge cancre constant 

* Replace the residual models with the ghreg models

forvalues i = 1/`n' {
	local a : word `i' of `ihs_var'
	local r = rownumb(matrix(IM),"`a'")
	forvalues y = 1/10 {
			mat IM[`r',`y'] = ghreg_`a'[1,`y']
	}
}


mat incoming_means = IM
matsave incoming_means, replace path("$indata") saving


* Make a matrix storing the cutpoints, shifting them so that the first cut point is 0

mat CP = J(4,4,.)

local a = 1
foreach x in 6 7 17 18 {
		forvalues y = 1/4 {
			capture mat CP[`a',`y'] = _b[cut_`x'_`y':_cons] - _b[cut_`x'_1:_cons]
		}
		local a = `a' + 1
}


mat colnames CP = cut_1 cut_2 cut_3 cut_4					
mat rownames CP = fwtstate fsmkstat frdb_ea_c frdb_na_c

mat incoming_cut_points = CP
matsave incoming_cut_points, replace path("$indata") saving


* Save the files in Stata 11 format
use $indata/incoming_vcmatrix.dta
saveold $indata/incoming_vcmatrix.dta, replace

use $indata/incoming_means.dta
saveold $indata//incoming_means, replace

use $indata/incoming_means_econ_tos.dta
saveold $indata//incoming_means_econ_tos.dta, replace

use $indata/incoming_cut_points.dta
saveold $indata/incoming_cut_points.dta, replace
   

capture log close