clear all
set maxvar 20000

********************************
********************************
* LOAD AND MERGE IN DATA
********************************
********************************
* Build a cleaned copy of the HRS tracker file that carries the string key
* rahhidpn, so that it can be merged 1:1 with the RAND-HRS file further down.

* HRS cross-wave tracker file (2022 release, judging by the file name; the
* original note states that earlier releases work just as well)
use "$rawdata/trk2022tr_r.dta"

* Convert every variable name to lower case with the built-in group rename
* instead of a hand-written per-variable loop.
rename *, lower

* rahhidpn = household id followed by person number
* (assumes hhid and pn are both stored as strings, since they are concatenated with +)
gen str rahhidpn = hhid + pn
sort rahhidpn
save "$dtafiles/HRSTracker", replace

* Load the RAND-HRS longitudinal file
use "$rawdata/randhrs1992_2018v2.dta"


/* Note that this file treats the 1993 AHEAD survey and the 1994 HRS survey as "wave 2".
Similarly, the 1995 AHEAD survey and the 1996 HRS survey are treated as "wave 3".
It is possible to distinguish whether an observation occurred in one survey or the other.
However, I choose to simply leave them combined. This should not present any issues,
as we use an age variable which does not depend on the "year" in which the
observation is listed as having occurred, but on the respondent's age at the end of the interview.
Furthermore, the designation of any observation as having occurred in the HRS 2000, 2002, 2004, etc.
is somewhat arbitrary: interviews take place during that even-numbered year as well as
the odd-numbered year after it. For this reason, treating the AHEAD surveys as
having occurred alongside the HRS surveys for the first two years should not present an issue.
*/


* Merge in the cleaned HRS tracker data.
* It supplies the first interview year, the exit-interview death year and the
* per-wave alive flags (aalive ... qalive; the tracker has no "ialive").
local aliveflags
foreach ltr in a b c d e f g h j k l m n o p q {
    local aliveflags `aliveflags' `ltr'alive
}

merge 1:1 rahhidpn using "$dtafiles/HRSTracker", keepusing(firstiw exdeathyr `aliveflags') nogen


********************************

/*Note, *agey_e is age at the end of the given interview. It is the age variable
recommended by RAND-HRS. While most of the recent interviews start and end on
the same day, earlier in the survey's history some interviews stretched over
multiple days (during which time a respondent's age in years may change).
Furthermore, using this age variable is superior to using interview year minus birth year, as it
reflects the fact that interviews are conducted at different times in the year.
*/
********************************


********************************
********************************
* LOAD IN AND NAME VARIABLES
********************************
********************************

* VitalStatus<w>: tracker alive flag copied to a wave-indexed name.
gen VitalStatus1 = aalive

* Waves 2 and 3 each pool two tracker flags; the later flag wins whenever it
* is not system-missing (presumably the AHEAD and HRS surveys that share a
* RAND wave -- confirm against the tracker codebook).
gen VitalStatus2 = .
foreach flag in balive calive {
    replace VitalStatus2 = `flag' if `flag' != .
}
gen VitalStatus3 = .
foreach flag in dalive ealive {
    replace VitalStatus3 = `flag' if `flag' != .
}

* Waves 4-14 map one-to-one onto the flags falive ... qalive
local w = 4
foreach ltr in f g h j k l m n o p q {
    gen VitalStatus`w' = `ltr'alive
    local ++w
}


*****************************
*****************************
* MODIFYING THE VIGOROUS EXERCISE VARIABLES
*****************************
*****************************

*Exercise variables

/*The questions regarding exercise change quite dramatically over the course of
the HRS. Due to different phrasings of the question, two seemingly equivalent 
responses about exercise habits could be indicative of very different levels of 
exercise. For the time being, I code the vigorous exercise variable so that a 
response which indicates the lowest amount of exercise 
(typically none or less than three times a week) is 0 and anything higher is 1. 
This creates the most consistent responses among respondents between waves. */


*In 1992, the question regarding vigorous exercise is as follows:

/*
1) How often do you
participate in vigorous physical exercise or sports
-- such as aerobics, running, swimming, or
bicycling?  (Would you say 3 or more times a week,
1 or 2 times a week, 1 to 3 times a month, less than
once a month, or never?)  [IMPUTED]

V513 Code  Frequency
			--------------------
					1       1609
					2       1239
					3       1034
					4       2424
					5       6346
*/

	
* Wave 1: code 5 (never) -> 0, codes 1-4 (any frequency) -> 1, anything else stays missing
gen vigact1 = 0 if r1vgactf == 5
replace vigact1 = 1 if inrange(r1vgactf, 1, 4)
	
	
*In 1994, the question regarding vigorous exercise is as follows (the question was not asked in the 1993 AHEAD study):

/*
W460    B43.    How often do you participate in vigorous physical
				activity or sports--such as heavy housework,
				aerobics, running, swimming, or bicycling?
		____________________________________________________________

				000.    None/never
						[B42: GO TO B43]
						[B43: GO TO B45]
				1-993.  Number of times
				994.    994+ (specify)
				995.    [B42: "All the time"; "All day at work"]
						[B43: "Job requires vigorous activity
						every day"]
						[B42: GO TO B43]
						[B43: GO TO B45]

				997.    Other (specify)
						[B42: GO TO B43]
						[B43: GO TO B45]

				998.    Don't Know/Not Ascertained; DK/NA
						[B42: GO TO B43]
						[B43: GO TO B45]
				999.    Refused; RF
						[B42: GO TO B43]
						[B43: GO TO B45]

				996.    Inap.
						Proxy interview for deceased Wave-1 R

		Variable    N       Mean     Std Dev     Minimum     Maximum
		------------------------------------------------------------
		W458    11596         55         220           0         999
		W460    11596         46         207           0         999



	W459    B42a.   (Was that per week, month, year, day, or what?)
	W461    B43a.   (Was that per week, month, year, day, or what?)
			____________________________________________________________

					02.     Week
					04.     Month
					06.     Year
					07.     Other (specify)
					11.     Day

					98.     Don't Know/Not Ascertained; DK/NA
					99.     Refused; RF

					00.     Inap.
							Proxy interview for deceased Wave-1 R
							[B42a: or B42=995-999]
							[B43a: or B43=995-999]

			W459 Code     Frequency
			-----------------------
					0          2043
					2          5478
					4           323
					6           105
					7            12
					8             2
				   11          3629
				   97             2
				   98             2

			W461 Code     Frequency
			-----------------------
					0          5757
					2          4017
					4           779
					6           175
					7            52
					8             1
					9             1
				   11           810
				   98             1
				   99             3

*/

* Pull the raw 1994 count (w460) and its time unit (w461) from the HRS 1994 file
merge 1:1 rahhidpn using "$rawdata/h94f1a.dta", nogen keepusing(w460 w461)
gen NumTimesVigEx2 = w460
gen TimeFrameVigEx2 = w461

* Anyone reporting any vigorous exercise (count 1-995) is coded 1; a count of
* zero is coded 0. Counts above 995 (inap / other / DK / refused per the
* codebook excerpt above) stay missing. The time unit is not used here.
gen vigact2 = 0 if NumTimesVigEx2 == 0
replace vigact2 = 1 if inrange(NumTimesVigEx2, 1, 995)

* Alternative (disabled): use RAND's own wave-2 coding, which is 1 for three
* or more times a week and 0 otherwise. It mirrors the later question wording
* but yields a much lower share of vigorous exercisers than 1995-2002:

*replace vigact2 = r2vigact_orig


*For 1995 AHEAD, 1996 HRS, 1998, 2000 and 2002, the question regarding vigorous exercise is as follows:

/*
D934      B19Q. VIGOROUS EXERCISE                   
		  Section: B            Level: Respondent      CAI Reference: Q934
		  Type: Numeric         Width: 1               Decimals: 0

		  B19q. On average over the last 12 months have you participated in vigorous
		  physical activity or exercise three times a week or more?

		  By vigorous physical activity, we mean things like, sports, heavy housework,
		  or a job that involves physical labor.
		  ................................................................................
		   2136         1. YES
		   4889         5. NO
						7. Other
			  2         8. DK (don't know); NA (not ascertained)
						9. RF (refused)
*/

* Waves 3-6: take RAND's binary vigorous-activity variable as is
forvalues wv = 3/6 {
    gen vigact`wv' = r`wv'vigact
}


*For 2004 through 2018, the vigorous exercise question becomes rvgactx* instead of rvigact*


*For 2004 through 2018, the question regarding vigorous exercise is as follows:
	
		/*
		
		JC223    HOW OFTEN VIGOROUS ACTIVITY
		 Section: C     Level: Respondent      Type: Numeric    Width: 1   Decimals: 0
		 CAI Reference: SecC.Disease.C223_VigAct


		We would like to know the type and amount of physical activity involved in
		your daily life. How often do you take part in sports or activities that are
		vigorous, such as running or jogging, swimming, cycling, aerobics or gym
		workout, tennis, or digging with a spade or shovel: more than once a week,
		once a week, one to three times a month, or hardly ever or never?
		..................................................................................
		 4169           1. MORE THAN ONCE A WEEK
		 1590           2. ONCE A WEEK
		 1464           3. ONE TO THREE TIMES A MONTH
		12501           4. HARDLY EVER OR NEVER
		  385           7. (VOL) EVERY DAY
		   17           8. DK (Don't Know); NA (Not Ascertained)
			3           9. RF (Refused)
					Blank. INAP (Inapplicable)
		*/

* Waves 7-14 (2004-2018): collapse r#vgactx to a binary indicator.
* Codes 1-4 (any vigorous exercise) -> 1, code 5 -> 0, everything else missing.
forvalues wv = 7/14 {
    gen vigact`wv' = 1 if inlist(r`wv'vgactx, 1, 2, 3, 4)
    replace vigact`wv' = 0 if r`wv'vgactx == 5
}

* The source variables are no longer needed
drop *vgactx*

** in what follows keep only vigact* and not the original RAND-HRS rvigact

*****************************
*****************************
* LOADING IN ADLs/IADLs
*****************************
*****************************

* The variables used in the construction of the frailty index are available for 
* waves 3 through 13, as the ADL/IADLS were not asked in the first two waves and
* the cognition data is not yet available for 2018. 2018/wave 14 can still be 
* included, but cognition score will be missing for every respondent

* Note that some of the interviews in wave 3 were conducted in 1995 as a part of
* the AHEAD survey. Because we determine age for a particular interview based on
* a separate variable, *agey_e (age in years at the end of the interview), 
* this should not pose a problem

* NOTE: an earlier version generated an all-missing cogtot2018 at this point.
* No pattern in the keep list below matches that name, so the variable was
* discarded immediately; the dead statement has been removed.

* First pass: restrict the wide file to the variables used below. Wildcards
* pick up every wave of a variable; some patterns are listed more than once,
* which is harmless.
keep rahhidpn r*smokev r*nhmliv r*issdi r*isdi r*sayret r*isret r*work ///
r*mstat s*iwstat r*eata r*dressa r*beda r*toilta r*batha r*walkra r*walksa r*phonea /// 
r*moneya r*shopa r*mealsa r*chaira r*stoopa r*lifta r*mapa r*medsa r*clim1a r*dimea /// 
r*armsa r*pusha r*hibpe r*diabe r*cancre r*lunge r*hearte r*stroke r*psyche r*arthre ///
r*bmi r*smokev r*back r*doctor r*doctim r*hosp r*homcar r*nrshom r*cogtot r*cesd /// 
r*shlt ragender raracem renhmliv renhmday renrstim rahispan raedyrs raedegrm ///
rabyear radyear radmonth raddate radtimtdth rameduc rafeduc  r*smoken* r*bmi /// 
r*agey_e h*cpl firstiw vigact* inw* r*oopmd r*oopmdf VitalStatus* exdeathyr ///
r*iearn s*iearn r*iwstat r*higov r*govmd r*govmr r*mrprem r*rxprem r*prprm1 ///
r*sayret s*sayret /// /*self-declaration of retirement*/  
h*atotb ///       /*household total assets*/ 
r*ipena s*ipena r*isret s*isret r*igxfr s*igxfr r*isdi s*isdi r*iunwc s*iunwc /// /* respondent and spouse incomes, various sources */
h*icap h*iothr h*itot h*ifssi h*ifcap h*ifothr h*iftot  ///    /* household income, various sources */
r*lbrf s*lbrf r*retyr s*retyr r*slfemp s*slfemp ///             /* work status*/
r*prprm2 r*prprm3 r*prpcnt ///
remstat r*imrc r*dlrc r*ser7 r*bwc20 r*pres r*vp r*mo r*dy r*dw r*yr r*cact r*scis ///
r*walk1a r*sita r*climsa ///
rassageb rabmonth r*iwendm r*agem_e r*nrstim r*nrsnit r*nhmday

/* we don't include r*sleepe (only available since wave 13), r*memrye (only waves 4-9), 
r*alzhee (only since wave 10), r*demene (only since wave 10) */

* The r*bmi wildcard also keeps the *pmbmi variables, which are not needed
drop *pmbmi


*****************************
*****************************
* MAKE BINARY VARIABLE INDICATING OBESITY
*****************************
*****************************


* r#bmigte30 = 1 if BMI >= 30, 0 if BMI < 30, missing if BMI is missing.
* The guard uses !missing() rather than "~=." because extended missing values
* (.a-.z) sort above every number AND are not equal to "."; with "~=." a BMI
* coded e.g. .d or .m satisfied "bmi>=30 & bmi~=." and was flagged as obese.
forvalues wv = 1/14 {

	gen byte r`wv'bmigte30 = (r`wv'bmi >= 30) if !missing(r`wv'bmi)

}

*****************************
*****************************
* MAKE BINARY VARIABLE INDICATING IF SRHS IS POOR OR FAIR
*****************************
*****************************

* r#shltfairpoor: 0 for self-rated health 1-3, 1 for 4-5, missing otherwise
forvalues wv = 3/14 {
	gen r`wv'shltfairpoor = 0 if inlist(r`wv'shlt, 1, 2, 3)
	replace r`wv'shltfairpoor = 1 if inlist(r`wv'shlt, 4, 5)
}

*****************************
*****************************
* CREATE ADJUSTED COGTOT VARIABLE WHERE 0 IS BEST AND 1 IS WORST
*****************************
*****************************

* Invert the cognition score and divide by 35 (waves 3-14)
forvalues wv = 3/14 {
	gen cogtotadj`wv' = (35 - r`wv'cogtot)/35
}

*****************************
*****************************
* CREATE ADJUSTED SCORES FOR CONTINUOUS VARIABLES
*****************************
*****************************

* CESD score divided by 8 (waves 2-14)
forvalues wv = 2/14 {
	gen cesdadj`wv' = (r`wv'cesd)/8
}

*****************************
*****************************
* CLARIFY WHY A RESPONDENT CHOOSES "don't do" FOR THOSE ADL/IADLs WHICH HAVE A FOLLOW UP QUESTION
*****************************
*****************************

* Now merge in some individual variables from the RAND-HRS files. 
* This is because some of the deficits have many missing responses otherwise. 
* For example, many people respond "don't do" when asked if they have difficulty
* preparing meals, going grocery shopping, using the phone, or taking medication. 
* Follow-up questions are employed to identify whether one's not doing an ADL/IADL 
* is the result of a health problem

* Merge the follow-up items from each raw HRS/AHEAD file, copy them to
* year-stamped names and drop the raw variables again.
*
* Changes relative to the earlier hand-written version:
*   - 2000: g2867 was listed twice in keepusing()
*   - 2002: hg050 was merged but never used, and hg052 was never dropped
*           (every other year drops its *g052)
*   - 2002-2018 share one variable layout and are handled in a single loop

* --- 1995 AHEAD ---
merge 1:1 rahhidpn using "$rawdata/ad95f2b.dta", keepusing(d2023 d2028 d2033 d2038) nogen
gen whydontmeals1995 = d2023
gen whydontgroc1995  = d2028
gen whydontphone1995 = d2033
gen whydontmed1995   = d2038
drop d2023 d2028 d2033 d2038

* --- 1996 HRS ---
merge 1:1 rahhidpn using "$rawdata/h96f4a.dta", keepusing(e2038 e2043 e2048 e2053) nogen
gen whydontmeals1996 = e2038
gen whydontgroc1996  = e2043
gen whydontphone1996 = e2048
gen whydontmed1996   = e2053
drop e2038 e2043 e2048 e2053

* --- 1998 ---
merge 1:1 rahhidpn using "$rawdata/hd98f2c.dta", keepusing(f2564 f2569 f2574 f2579) nogen
gen whydontmeals1998 = f2564
gen whydontgroc1998  = f2569
gen whydontphone1998 = f2574
gen whydontmed1998   = f2579
drop f2564 f2569 f2574 f2579

* --- 2000 (adds the "if needed" medication item) ---
merge 1:1 rahhidpn using "$rawdata/h00f1d.dta", keepusing(g2862 g2867 g2872 g2876 g2877) nogen
gen whydontmeals2000 = g2862
gen whydontgroc2000  = g2867
gen whydontphone2000 = g2872
gen whydontmed2000   = g2877
gen ifneededmed2000  = g2876
drop g2862 g2867 g2872 g2876 g2877

* --- 2002-2018: identical layout, wave letter prefix + g042/g045/g048/g051/g052/g060 ---
* (an earlier version read the 2018 items from "$dtafiles/h18e1a.dta")
local fu_files   h02f2c h04f1c h06f4a h08f3a hd10f6a h12f3a h14f2b h16f2c h18f2b
local fu_letters h j k l m n o p q
local yr = 2002
forvalues i = 1/9 {
    local f : word `i' of `fu_files'
    local p : word `i' of `fu_letters'

    merge 1:1 rahhidpn using "$rawdata/`f'.dta", ///
        keepusing(`p'g042 `p'g045 `p'g048 `p'g051 `p'g052 `p'g060) nogen

    gen whymeals`yr'    = `p'g042
    gen whygroc`yr'     = `p'g045
    gen whyphone`yr'    = `p'g048
    gen whymed`yr'      = `p'g052
    gen ifneededmed`yr' = `p'g051
    gen whymoney`yr'    = `p'g060

    drop `p'g042 `p'g045 `p'g048 `p'g051 `p'g052 `p'g060

    local yr = `yr' + 2
}

* Use the follow-up answers to fill in the RAND difficulty variables:
* a follow-up code of 1 sets the difficulty flag to 1, a code of 5 sets it to 0.
* The 1995 AHEAD and 1996 HRS respondents both feed wave 3 (1995 first, then 1996).

foreach spec in "3 1995" "3 1996" "4 1998" "5 2000" {
    local w  : word 1 of `spec'
    local yr : word 2 of `spec'
    foreach pair in "mealsa meals" "shopa groc" "phonea phone" "medsa med" {
        local adl : word 1 of `pair'
        local src : word 2 of `pair'
        replace r`w'`adl' = 1 if whydont`src'`yr' == 1
        replace r`w'`adl' = 0 if whydont`src'`yr' == 5
    }
}

* 2000 additionally has the "if needed" medication follow-up
replace r5medsa = 1 if ifneededmed2000 == 1
replace r5medsa = 0 if ifneededmed2000 == 5

* .z is "don't do, no if did" specifically for the medsa variable
* Reference page 792 in the RAND HRS Longitudinal File 2018 (V1) Documentation
replace r4medsa = 0 if r4medsa == .z
replace r5medsa = 0 if r5medsa == .z

*** Waves 6-14 (2002-2018): same logic, plus the money-management follow-up
forvalues w = 6/14 {
    local yr = 1990 + 2*`w'

    foreach pair in "mealsa meals" "shopa groc" "phonea phone" "medsa med" {
        local adl : word 1 of `pair'
        local src : word 2 of `pair'
        replace r`w'`adl' = 1 if why`src'`yr' == 1
        replace r`w'`adl' = 0 if why`src'`yr' == 5
    }

    replace r`w'medsa = 1 if ifneededmed`yr' == 1
    replace r`w'medsa = 0 if ifneededmed`yr' == 5
    replace r`w'medsa = 0 if r`w'medsa == .z

    replace r`w'moneya = 1 if whymoney`yr' == 1
    replace r`w'moneya = 0 if whymoney`yr' == 5
}

*** Sort on the person id and move it to the front
sort rahhidpn
order rahhidpn

*** Second pass: keep only what is needed from here on. This drops the raw
*** follow-up variables and other helpers created since the first keep.

* health, functional limitations, utilisation, demographics
local k_health ///
    rahhidpn r*smokev r*nhmliv r*issdi r*isdi r*sayret r*isret ///
    r*work r*mstat s*iwstat r*eata r*dressa r*beda r*toilta r*batha r*walkra ///
    r*walksa r*phonea r*moneya r*shopa r*mealsa r*chaira r*stoopa r*lifta r*mapa ///
    r*medsa r*clim1a r*dimea r*armsa r*pusha r*hibpe r*diabe r*cancre r*lunge ///
    r*hearte r*stroke r*psyche r*arthre r*bmi r*bmigte30 r*back r*doctor r*doctim ///
    r*hosp r*homcar r*nrshom r*cogtot cogtotadj* cesdadj* r*shlt r*shltfairpoor ///
    ragender raracem renhmliv renhmday renrstim rahispan raedegrm raedyrs rabyear ///
    radyear radmonth raddate radtimtdth r*smokev r*smoken  r*agey_e h*cpl firstiw ///
    vigact* inw* r*oopmd r*oopmdf VitalStatus* exdeathyr r*iearn s*iearn r*iwstat

* self-declared retirement, household assets, respondent/spouse and household
* incomes, work status
local k_econ ///
    r*sayret s*sayret ///
    h*atotb ///
    r*ipena s*ipena r*isret s*isret r*igxfr s*igxfr r*isdi s*isdi r*iunwc s*iunwc ///
    h*icap h*iothr h*itot h*ifssi h*ifcap h*ifothr h*iftot ///
    r*lbrf s*lbrf r*retyr s*retyr r*slfemp s*slfemp

* insurance, cognition items, remaining mobility items, dates and nursing home
local k_other ///
    r*higov r*govmd r*govmr r*mrprem r*rxprem r*prprm1 r*prprm2 r*prprm3 r*prpcnt ///
    remstat r*imrc r*dlrc r*ser7 r*bwc20 r*pres r*vp r*mo r*dy r*dw r*yr r*cact r*scis ///
    r*walk1a r*sita r*climsa ///
    rassageb rabmonth r*iwendm r*agem_e r*nrstim r*nrsnit r*nhmday

keep `k_health' `k_econ' `k_other'




*****************
*****************
* IMPUTE MISSING VALUES BASED ON PRESENCE OF IDENTICAL VALUES IN BOTH NEIGHBORING YEARS
*****************
*****************

/* To reduce the number of missing values, we do the following: 
We replace all missing values for observation-years which are between two 
observation-years which have the same value with that value. 
This means that if, for instance, in 1994 and 1998 a respondent reported 
difficulty with eating, but in 1996 this variable has a missing value,
the value in 1996 will be recoded such that they reported difficulty for that year, too */

* Fill a MISSING wave-w value when the same variable has the identical value
* (both 1 or both 0) in waves w-1 and w+1.
*
* NOTE(review): the earlier version had no missing() guard, so an observed
* value that disagreed with two agreeing neighbours (e.g. 1, 0, 1) was also
* overwritten. That contradicts the stated rule of replacing missing values
* only; the guard below restricts the fill to missing cells. Downstream
* results may change slightly -- confirm this is the intended rule.

local corrigenda eata dressa beda toilta batha walkra walksa phonea moneya shopa ///
mealsa chaira stoopa lifta mapa medsa clim1a dimea armsa pusha hibpe diabe cancre ///
lunge hearte stroke psyche arthre smokev back doctor hosp homcar nrshom bmigte30 ///
walk1a sita climsa

*** Waves 3-13 are filled; wave 2 is the earliest neighbour used and wave 14 the latest

foreach var of local corrigenda {
	forvalues wv = 3/13 {
		local wvm1 = `wv' - 1
		local wvp1 = `wv' + 1
		foreach val in 1 0 {
			replace r`wv'`var' = `val' if missing(r`wv'`var') ///
				& r`wvm1'`var' == `val' & r`wvp1'`var' == `val'
		}
	}
}

*** r#shltfairpoor is only constructed for waves 3-14 earlier in this file, so
*** wave 3 is the earliest available neighbour and the fill starts at wave 4

foreach var in shltfairpoor {
	forvalues wv = 4/13 {
		local wvm1 = `wv' - 1
		local wvp1 = `wv' + 1
		foreach val in 1 0 {
			replace r`wv'`var' = `val' if missing(r`wv'`var') ///
				& r`wvm1'`var' == `val' & r`wvp1'`var' == `val'
		}
	}
}

*****************
*****************
* RECODE FIRST INTERVIEW FOR 1993 and 1994
*****************
*****************

* Recode firstiw to reflect the fact that I am treating the 1993 AHEAD as being
* part of the 1994 HRS and the 1995 AHEAD as the 1996 HRS
* Fold the AHEAD entry years into the HRS year of the same RAND wave
recode firstiw (1993 = 1994) (1995 = 1996)

*****************
*****************
* CREATE BEHAVIORAL VARIABLES FROM FIRST YEAR OF INTERVIEW
*****************
*****************

* Smoking, obesity and vigorous activity as observed in the wave of the first interview
foreach beh in smokev obese vigact {
	gen `beh'FirstInterview = .
}

* wave w corresponds to survey year 1990 + 2w (wave 1 = 1992, ..., wave 14 = 2018)
forvalues wv = 1/14 {
	local yr = 1990 + 2*`wv'

	replace smokevFirstInterview = r`wv'smokev   if firstiw == `yr'
	replace obeseFirstInterview  = r`wv'bmigte30 if firstiw == `yr'
	replace vigactFirstInterview = vigact`wv'    if firstiw == `yr'
}

*** Save the wide (one row per person) file
save "$dtafiles/panel_wide", replace

**************
**************
* RESHAPE
**************
**************

use "$dtafiles/panel_wide", clear

* Stubs are grouped into macros only for readability; their order is the
* order in which they are handed to reshape. "@" marks where the wave number
* sits inside the wide variable name (no "@" means the number is a suffix).
local stubs1 ///
    r@eata r@nhmliv r@issdi r@isdi r@sayret r@isret r@work r@mstat s@iwstat ///
    r@iwstat r@dressa r@beda r@toilta r@batha r@walkra r@walksa r@phonea r@moneya r@shopa
local stubs2 ///
    r@mealsa r@chaira r@stoopa r@lifta r@mapa r@medsa r@clim1a r@dimea r@armsa r@pusha r@hibpe ///
    r@diabe r@cancre r@lunge r@hearte r@stroke r@psyche r@arthre r@bmi r@bmigte30 r@smokev r@back
local stubs3 ///
    r@doctor r@doctim r@hosp r@homcar r@nrshom r@cogtot cogtotadj@ cesdadj@ r@shlt r@shltfairpoor ///
    r@smoken r@agey_e h@cpl vigact inw@  r@oopmd  r@oopmdf VitalStatus
local stubs4 ///
    r@iearn s@iearn r@higov r@govmd r@govmr ///
    s@sayret h@atotb ///
    r@ipena s@ipena s@isret r@igxfr s@igxfr s@isdi r@iunwc s@iunwc ///
    h@icap h@iothr h@itot h@ifssi h@ifcap h@ifothr h@iftot ///
    r@lbrf s@lbrf s@retyr r@slfemp s@slfemp ///
    r@mrprem r@rxprem r@prprm1 r@prprm2 r@prprm3 r@prpcnt
local stubs5 ///
    r@imrc r@dlrc r@ser7 r@bwc20 r@pres r@vp r@mo r@dy r@dw r@yr r@cact r@scis ///
    r@fimrc r@fdlrc r@fser7 r@fbwc20 r@fpres r@fvp r@fmo r@fdy r@fdw r@fyr r@fcact r@fscis
local stubs6 ///
    r@iwendy r@depres r@retyr r@rplnyr ///
    r@walk1a r@sita r@climsa ///
    r@iwendm r@agem_e r@nrstim r@nrsnit r@nhmday

reshape long `stubs1' `stubs2' `stubs3' `stubs4' `stubs5' `stubs6', i(rahhidpn) j(wave)

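/* In the reshape list above: the r@imrc ... r@scis stubs are the cognition variables
and the r@fimrc ... r@fscis stubs are their imputation flags; the following line
(r@iwendy r@depres r@retyr r@rplnyr) lists other variables that did not get reshaped
(the line numbers originally cited here, 875-877, no longer match this file) */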



************
************
 * GENERATE YEAR 
************
************
* Survey year for each wave: wave 1 = 1992, then two years per wave up to wave 14 = 2018
gen year = 1990 + 2*wave if inrange(wave, 1, 14)

**************
**************
* DROP OBS IN WHICH RESPONDENT WAS NOT PRESENT, BUT KEEP THE ONES FOR WHICH SOME RESPONDENTS WERE DEAD
**************
**************

*** These used to be in reshape, but I don't like that. I'd rather create them here.
* Dead = 1 for interview status 5 or 6, 0 for status 1, 4 or 7, missing for
* any other status. Alive is its complement (missing where Dead is missing).
gen Dead = 1 if inlist(riwstat, 5, 6)
replace Dead = 0 if inlist(riwstat, 1, 4, 7)
gen Alive = 1 - Dead


* Earlier, disabled definition based on the first interview and death years:
*replace Dead=0 if year>=firstiw
*replace Dead=1 if year>=exdeathyr


order rahhidpn year Dead exdeathyr ragey_e

**** Give dead person-waves an age: where the age at interview is
**** system-missing and the respondent is coded dead, use survey year minus
**** birth year (the mortality model is a 2-year model)

replace ragey_e = year - rabyear if Dead == 1 & ragey_e == .

/* Longer Method to accomplish the same goal as the above code

	gen Temp=.
	replace Temp=ragey_e if Alive==1
	bys rahhidpn: egen AgeLastYearAlive=max(Temp)
	drop Temp
	
	forvalues num in 1(1)13{
	
	replace agey_e=AgeLastYearAlive+2*`num' if year==FirstYearDead+2*(`num'-1)
	
	}
	
*/

/***** Price index used to deflate income and wealth *****/
/***** It refers to the previous calendar year, as income does *****/

* cpi18 = 100 * (price level of the year before the survey year) / (2018 price level),
* so that all amounts can be expressed in 2018 dollars as in the paper.
* The first entry (136.2) refers to 1991 for survey year 1992, because the
* out-of-pocket question is retrospective; the second (144.5) to 1993, and so on.
* NOTE(review): the figures look like CPI annual averages -- confirm the source series.
gen cpi18 = .
gen basecpi18 = 251.107

local levels 136.2 144.5 152.4 160.5 166.6 177.1 184.0 195.3 ///
    207.342 214.537 224.939 232.957 237.017 245.120

local yr = 1992
foreach lvl of local levels {
    replace cpi18 = (100/basecpi18)*`lvl' if year == `yr'
    local yr = `yr' + 2
}


**************
**************
* LABEL VARS
**************
**************

* Variable labels for the long file.
* Fix: the rdoctor label previously read "Doctor visit in the last twelve
* months years"; it now states a two-year window like the neighbouring
* utilisation labels.
* NOTE(review): confirm the reference period against the RAND codebook.

* --- functional limitations (ADLs / IADLs / mobility) ---
label variable reata "Difficulty eating"
label variable rdressa "Difficulty dressing "
label variable rbeda "Difficulty getting in or out of bed"
label variable rtoilta "Difficulty using the toilet, including getting up or down"
label variable rwalkra "Difficulty walking across a room"
label variable rphonea "Difficulty making phone calls"
label variable rmoneya "Difficulty managing money, such as paying bills and keeping track of expenses"
label variable rshopa "Difficulty shopping for groceries"
label variable rmealsa "Difficulty preparing a hot meal"
label variable rchaira "Difficulty getting up from a chair after sitting for long periods"
label variable rstoopa "Difficulty stooping, kneeling, or crouching"
label variable rlifta "Difficulty lifting or carrying weights over 10 pounds"
label variable rmapa "Difficulty using a map to figure out how to get around in a strange place"
label variable rmedsa "Difficulty taking medication"
label variable rclim1a "Difficulty climbing one flight of stairs "
label variable rdimea "Difficulty picking up a dime from a table"
label variable rarmsa "Difficulty reaching or extending your arms above shoulder level"
label variable rpusha "Difficulty pulling or pushing large objects like a living room chair"

* --- diagnoses ---
label variable rhibpe "Ever diagnosed with high blood pressure"
label variable rdiabe "Ever diagnosed with diabetes"
label variable rcancre "Ever diagnosed with cancer"
label variable rlunge "Ever diagnosed with lung disease"
label variable rhearte "Ever diagnosed with a heart condition"
label variable rstroke "Ever diagnosed as having had a stroke"
label variable rpsyche "Ever diagnosed with a nervous, emotional, or psychological problem"
label variable rarthre "Ever diagnosed with arthritis"

* --- body mass, smoking, pain, health-care use ---
label variable rbmi "Body mass index (kg/m²)"
label variable rbmigte30 "Has BMI equal to or greater than 30"
label variable rsmokev "Has ever smoked cigarettes"
label variable rback "Back pain or problems"
label variable rdoctor "Doctor visit in the last two years"
label variable rdoctim "Number of doctor visits in the last two years"
label variable rhosp "Hospital stay in the last two years"
label variable rhomcar "Home health care in the last two years"
label variable rnrshom "Nursing home stay in the last two years"

* --- cognition, depression, self-rated health ---
label variable rcogtot "Total cognition summary score (defined by the HRS)"
label variable cogtotadj "Total cognition summary score inverted and set between 0 and 1"
label variable cesdadj "Total depression summary score set between 0 and 1"
label variable rshlt "Self-reported health status: 1=excellent, 2=very good, 3=good, 4=fair, 5=poor"
label variable rshltfairpoor "Binary variable: 1=SRHS of 4 or 5, 0=SRHS of 1, 2, or 3"

* --- behaviours, age, household ---
label variable rsmoken "Smokes now"
label variable ragey_e "Age in years at the end of the interview"
label variable hcpl "Coupled "
label variable vigact "Vigorous exercise"

* --- insurance ---
label variable rprprm1 "Insurance Plan 1 premium"
label variable rprprm2 "Insurance Plan 2 premium"
label variable rprprm3 "Insurance Plan 3 premium"
label variable rprpcnt "Number of Private Plans"
label variable rmrprem "Medicare/Medicaid Premium"
label variable rgovmr "Respondent is covered by Medicare"
label variable rgovmd "Respondent is covered by Medicaid"
label variable rhigov "Respondent is covered by gov plan"
label variable rrxprem "Medicare Part D Prescription Premium"

/* Need to add labels to the following:
year          riwstat       risret        EDU2_SSGAC16  vigactFirs~w 	rclimsa
Dead          rnhmliv       siearn        EDU3_SSGAC18  pc				risdi
wave          roopmd        rsayret       EA3_W23_S~18  HRSPC
rmstat        roopmdf       rwork         VitalStatus   Alive
siwstat       riearn        rwalksa       smokevFirs~w	rwalk1a
inw           rissdi        rbatha        obeseFirst~w	rsita
*/

***************************
***************************
* RECODE "don't do" AS 2
***************************
***************************

* Recode the extended missing code .x (don't do) to the value 2 for every
* ADL/IADL item listed below. Only some items have follow-up questions that
* explain why the respondent does not do the activity, so the remaining .x
* cases are kept distinguishable from other missing codes by giving them
* their own value. The frailty index still treats 2 as missing (these items
* are later restricted to 0/1); the distinction mainly matters when
* self-reported health is instrumented with objective measures.

local dontdo_items eata dressa beda toilta batha walkra walksa phonea moneya ///
	shopa mealsa chaira stoopa lifta mapa medsa clim1a dimea armsa pusha ///
	walk1a sita climsa

* Each item is stored with the respondent prefix r (e.g. reata).
foreach item of local dontdo_items {
	recode r`item' (.x = 2)
}

*********************************************************************************
*********************************************************************************
**************************** SAVE DATA BEFORE FRAILTY ***************************
*********************************************************************************
*********************************************************************************

* Snapshot the panel as it stands before any frailty variables are added.
save "$dtafiles/Panel_before_frailty", replace

*********************************************************************************
*********************************************************************************
***************** CONSTRUCT FRAILTY INDEX (AND ITS VARIANTS) ********************
*********************************************************************************
*********************************************************************************

* Reload the snapshot just written and run the frailty construction do-file
* on it.
use "$dtafiles/Panel_before_frailty", clear
do "$dofiles/ConstructFrailty"

*********************************************************************************
*********************************************************************************
*********************** GENERATE COHORT VARIABLES *******************************
*********************************************************************************
*********************************************************************************

/*
gen cohort6=.
replace cohort6=1 if rabyear>=1890
replace cohort6=2 if rabyear>=1904
replace cohort6=3 if rabyear>=1918
replace cohort6=4 if rabyear>=1932
replace cohort6=5 if rabyear>=1946
replace cohort6=6 if rabyear>=1962
replace cohort6=. if rabyear>1976
*/

* Five 16-year birth cohorts: [1890,1906), [1906,1922), ..., [1954,1970).
* Birth years before 1890, from 1970 onward, or missing stay missing
* (a missing rabyear fails the strict upper bound).
gen cohort = .
forvalues k = 1/5 {
	local lo = 1890 + 16*(`k' - 1)
	replace cohort = `k' if rabyear >= `lo' & rabyear < `lo' + 16
}

* Alternative grouping: four 20-year cohorts over the same 1890-1969 span.
gen cohortalt = .
forvalues k = 1/4 {
	local lo = 1890 + 20*(`k' - 1)
	replace cohortalt = `k' if rabyear >= `lo' & rabyear < `lo' + 20
}

* One 0/1 indicator per cohort; missing whenever cohort itself is missing.
forvalues k = 1/5 {
	gen c`k' = (cohort == `k') if !missing(cohort)
}


*********************************************************************************
*********************************************************************************
**************************** CONSTRUCT MORE VARIABLES ***************************
*********************************************************************************
*********************************************************************************


****GENERATE DEMOGRAPHIC VARIABLES

*** Race
* raracem takes the values 1, 2, 3 here; each dummy is 1 for its own code,
* 0 for the other two codes, and missing for anything else.
gen white = (raracem == 1) if inlist(raracem, 1, 2, 3)
gen black = (raracem == 2) if inlist(raracem, 1, 2, 3)
gen other = (raracem == 3) if inlist(raracem, 1, 2, 3)

*** Gender
* ragender: 1 is coded as male, 2 as not male; other values stay missing.
gen male = (ragender == 1) if inlist(ragender, 1, 2)

*** Ethnicity
* rahispan: 1 is coded as Hispanic, 0 as non-Hispanic; other values stay
* missing. The two dummies are exact complements.
gen nonhispanic = (rahispan == 0) if inlist(rahispan, 0, 1)
gen hispanic = (rahispan == 1) if inlist(rahispan, 0, 1)

/* COMPARISON WITH PREVIOUS CODE: I kept ADLs and other variables for wave 2 (1994)
when present. Constructed variables like frailty or instrumented shlt are 
unaffected because of missing values on other ADLs or conditions in 1994. 
Notable difference: the variable NursingHomeNext is created for 1994 too, 
while in previous version it was not. */

******* RENAME VARIABLES FOR COMPATIBILITY WITH OTHER CODES
/* Open question from the original author: which renames are actually needed
downstream, and is this step required at all? */

* Respondent-level variables: strip the leading r from each name below.
local r_prefixed eata nhmliv issdi isdi sayret isret work mstat dressa beda toilta ///
	batha walkra walksa phonea moneya shopa mealsa chaira stoopa lifta mapa medsa ///
	clim1a dimea armsa pusha hibpe diabe cancre lunge hearte stroke psyche arthre ///
	bmi smokev back doctor doctim hosp homcar nrshom cogtot shlt smoken agey_e ///
	oopmd oopmdf iearn iwstat higov govmd govmr mrprem rxprem prprm1 prprm2 prprm3 prpcnt ///
	walk1a sita climsa

foreach stem of local r_prefixed {
	rename r`stem' `stem'
}

* Remaining one-off renames (spouse variables, wave flag, derived indicators).
rename (siearn siwstat inw rbmigte30 rshltfairpoor) ///
	(iearnspouse iwstatspouse InWave bmigte30 shltfairpoor)

*** Generate binary variables that tell us whether the spouse is dead or alive
* Spouse interview status codes 5 and 6 are treated as dead; codes 1, 4 and 7
* as alive. Any other code leaves both indicators missing.
gen DeadSpouse=.
gen AliveSpouse=.

replace DeadSpouse=1 if iwstatspouse==5 | iwstatspouse==6
replace DeadSpouse=0 if iwstatspouse==1 | iwstatspouse==4 | iwstatspouse==7
replace AliveSpouse=1-DeadSpouse

* Only code 5 counts as a death in the current wave; code 6 is dead but not a
* death in this wave, so it is 0 here.
gen SpouseDiedThisWave=.
replace SpouseDiedThisWave=1 if iwstatspouse==5
replace SpouseDiedThisWave=0 if (iwstatspouse==1 | iwstatspouse==6 | iwstatspouse==4 | iwstatspouse==7)

* The lag below uses the previous row, so rows must be ordered within person.
sort rahhidpn year

*** Generate binary variables that tell us if the spouse died in the current or last wave

* Previous row's value, taken only when the previous row is the same person.
gen SpouseDiedLastWave=.
replace SpouseDiedLastWave=1 if SpouseDiedThisWave[_n-1]==1 & rahhidpn==rahhidpn[_n-1]
replace SpouseDiedLastWave=0 if SpouseDiedThisWave[_n-1]==0 & rahhidpn==rahhidpn[_n-1]

* FIX: the two replace statements previously wrote to SpouseDiedLastWave,
* which left SpouseDiedThisOrLastWave entirely missing and overwrote the
* lagged indicator with the combined one. They now fill the variable that
* was just generated.
gen SpouseDiedThisOrLastWave=.
replace SpouseDiedThisOrLastWave=1 if SpouseDiedThisWave==1 | SpouseDiedLastWave==1
replace SpouseDiedThisOrLastWave=0 if SpouseDiedThisWave==0 & SpouseDiedLastWave==0


********************************************************************************
**** Construct outcome variables from Hosseini, Kopecky, and Zhao (2022)  ******
********************************************************************************

/* The outcomes below look one wave ahead, so the data are declared as a panel
with tsset and the lead operator F. is used. F. respects gaps: for someone
observed only in waves 3 and 5, F.variable in wave 3 is missing, whereas
variable[_n+1] would silently pick up the wave-5 value. */

sort rahhidpn wave
* tsset needs a numeric panel identifier.
destring rahhidpn, replace
tsset rahhidpn wave

************************ DEATH NEXT PERIOD *************************************
* DeadNext copies next wave's Dead flag when that flag is 0 or 1, and is left
* missing for rows where the respondent is already dead in the current wave.
gen DeadNext = F.Dead if (F.Dead == 0 | F.Dead == 1) & Dead != 1

/* NOTE(review): an earlier comment here warned that in the balanced panel
DeadNext is 1 in all waves after death; with the Dead != 1 restriction above
those rows are missing instead. Confirm which behavior downstream code expects. */

** Check that there are no zombies
*count if Dead==1 & DeadNext==0


*** surv: 1 if the respondent survives to the next wave, 0 if not
gen surv = 1 - DeadNext

************************ NH ENTRY NEXT PERIOD **********************************
/* Goal: in wave t, flag whether the respondent enters a nursing home in
wave t+1. */

*** Next-wave nursing home residence and the current/next transition table
gen Fnhmliv = F.nhmliv
tab nhmliv Fnhmliv

/* Entry is only defined for respondents who are NOT in a nursing home in the
current wave (the first row of the table above): it equals next wave's
residence flag when that flag is 0 or 1, and is missing otherwise. */
gen nhm_ent_next = Fnhmliv if nhmliv == 0 & (Fnhmliv == 0 | Fnhmliv == 1)

*** Also count as entry respondents who die before the next wave, are recorded
*** as living in a nursing home at death (renhmliv), with renhmday of at least
*** 15, and who were not in a nursing home in their last wave.
* NOTE(review): a missing renhmday also satisfies >= 15 in Stata - confirm
* that this is intended.
replace nhm_ent_next = 1 if nhmliv == 0 & DeadNext == 1 & renhmliv == 1 & renhmday >= 15


/* ROSS' WAY: CAN DELETE AFTER MARGHE HAS CHECKED 
gen Lnhmliv = nhmliv[_n-1] if rahhidpn==rahhidpn[_n-1]
gen 	nhm_ent = 0 if nhmliv == 0 & Lnhmliv==0
replace nhm_ent = 1 if nhmliv == 1 & Lnhmliv==0
gen nhm_ent_next=.
replace nhm_ent_next=nhm_ent[_n+1] if rahhidpn==rahhidpn[_n+1]
replace nhm_ent_next = 1 if DeadNext==1 & renhmliv==1 & renhmday>=15 
replace nhm_ent_next=. if nhm_ent==1
*/

************************ SSDI NEXT PERIOD **************************************
/* We want a variable that tells us, in wave t, whether the respondent will start
receiving disability benefits in wave t+1 */

/* For comparability with HKZ, first create a variable that includes both SSI and
SSDI. We also add the condition that age must be smaller than 66, because SSDI
gets converted to Social Security benefits once people reach the full retirement
age. Also, HKZ do it this way. Be careful, though, because SSI does not go away
with the full retirement age, so we might still have people getting that. */

*** Create a flag for nonmissing income from SSDI
* FIX: use missing() rather than != . so that extended missing codes (.a-.z)
* are not treated as observed zero income.
gen ssdi = 0 if !missing(issdi)
*** Replace with 1 if ssdi income is positive
replace ssdi = 1 if inrange(issdi, 1, 500000)

*** Generate ssdi in the next period
by rahhidpn: gen Fssdi = F.ssdi
tab ssdi Fssdi

*** Now generate SSDI "entry" in the same way as we did for NH:
*** defined only for respondents with no benefit income in the current wave
gen ssdi_trans_next = .
replace ssdi_trans_next=1 if Fssdi==1 & ssdi==0
replace ssdi_trans_next=0 if Fssdi==0 & ssdi==0

*** Finally, replace with missing for people aged 66 or older
*** (a missing age also satisfies >= 66 and is set to missing)
replace ssdi_trans_next=. if agey_e>=66

*** Repeat all of the above, but for disability only.
*** No age restriction is applied here; the FRA-based restriction for this
*** variable is applied later in the file.
gen ssdi_dionly = 0 if !missing(isdi)
replace ssdi_dionly = 1 if inrange(isdi,1,500000)
by rahhidpn: gen Fssdi_dionly = F.ssdi_dionly
tab ssdi_dionly Fssdi_dionly
gen ssdi_dionly_trans_next = .
replace ssdi_dionly_trans_next = 1 if Fssdi_dionly==1 & ssdi_dionly==0
replace ssdi_dionly_trans_next = 0 if Fssdi_dionly==0 & ssdi_dionly==0

/* ROSS' WAY: CAN DELETE AFTER MARGHE HAS CHECKED 
gen ssdi = 0 if issdi!=.
*** Replace with 1 if ssdi income is positive 
replace ssdi = 1 if inrange(issdi, 1, 500000)
/* this section is only SSDI
sum risdi,detail
tabstat risdi, by(age)
replace risdi = . if age >= 66
gen 	ssdi = 0 if risdi == 0
replace ssdi = 1 if inrange(risdi, 1, 500000)
*/
gen 	Lssdi = ssdi[_n-1] if rahhidpn==rahhidpn[_n-1]
gen		ssdi_trans = 0 if ssdi==0 & Lssdi==0
replace	ssdi_trans = 1 if ssdi==1 & Lssdi==0

gen ssdi_trans_next_ross=.
replace ssdi_trans_next_ross=ssdi_trans[_n+1] if rahhidpn==rahhidpn[_n+1]
*/
******************** SELF-REPORTED RETIREMENT NEXT PERIOD **********************
/* Goal: in wave t, record whether the respondent moves into self-reported full
retirement by wave t+1. */
tab sayret, m

by rahhidpn: gen Fsayret = F.sayret
tab sayret Fsayret

gen sayret_trans_next = .
*** 0: not retired in t, and next wave either still not retired (0) or only
*** partially retired (2)
replace sayret_trans_next = 0 if sayret == 0 & inlist(Fsayret, 0, 2)
*** 1: fully retired (1) in t+1, coming from not retired (0) or partially
*** retired (2) in t
replace sayret_trans_next = 1 if Fsayret == 1 & inlist(sayret, 0, 2)


/*  ROSS' WAY: CAN DELETE AFTER MARGHE HAS CHECKED 
*** Say that you are retired next period 
gen Lsayret = sayret[_n-1] if rahhidpn==rahhidpn[_n-1]
gen sayret_trans = .
replace sayret_trans=0 if sayret==0 & Lsayret==0
replace sayret_trans=0 if sayret==2 & Lsayret==0
replace sayret_trans=1 if sayret==1 & Lsayret==0
replace sayret_trans=1 if sayret==1 & Lsayret==2

gen sayret_trans_next_ross=.
replace sayret_trans_next_ross=sayret_trans[_n+1] if rahhidpn==rahhidpn[_n+1]
*/

******************** CLAIMING SS BENEFITS NEXT PERIOD ************************
/* We want a variable that, in wave t, tells us whether the respondent reports
positive income from social security retirement benefits in wave t+1. This looks
a little different because isret is a continuous variable. */

by rahhidpn: gen Fisret = F.isret
gen isret_trans_next = .
*** 0 if no income from SS benefits in wave t+1 and none in t
replace isret_trans_next=0 if Fisret==0 & isret==0
*** 1 if some income from SS benefits in wave t+1 and none in t.
*** Stata orders every missing value above all numbers, so a missing Fisret
*** would satisfy Fisret>0.
*** FIX: test with missing() instead of != . so that extended missing codes
*** (.a-.z) are excluded as well.
replace isret_trans_next=1 if Fisret>0 & !missing(Fisret) & isret==0
*** The former special case for the last wave (wave==14 & Fisret==.) has been
*** dropped: with the guard above, the outcome is already missing whenever
*** Fisret is missing, in any wave.

*** Check
*** isret_trans_next in wave 3 should be missing
*br rahhidpn wave mstat isret isret_trans_next if rahhidpn==010063010

/*  ROSS' WAY: CAN DELETE AFTER MARGHE HAS CHECKED 
gen Lisret = isret[_n-1] if rahhidpn==rahhidpn[_n-1]
gen isret_trans = .
replace isret_trans=0 if isret==0 & Lisret==0
replace isret_trans=1 if isret>0 & Lisret==0
gen isret_trans_next_ross=.
replace isret_trans_next_ross=isret_trans[_n+1] if rahhidpn==rahhidpn[_n+1]
*/

************************** WORKING NEXT PERIOD *********************************
/* Goal: in wave t, for respondents who are not working, record whether they
are working in wave t+1. */
by rahhidpn: gen Fwork = F.work
tab work Fwork

* Defined only when work is 0 now and next wave's work flag is 0 or 1;
* the outcome then equals next wave's flag.
gen work_trans_next = Fwork if work == 0 & (Fwork == 0 | Fwork == 1)

/*  ROSS' WAY: CAN DELETE AFTER MARGHE HAS CHECKED 
gen Lwork = work[_n-1] if rahhidpn==rahhidpn[_n-1]
gen work_trans = .
replace work_trans=0 if work==0 & Lwork==0
replace work_trans=1 if work==1 & Lwork==0
gen work_trans_next_ross=.
replace work_trans_next_ross=work_trans[_n+1] if rahhidpn==rahhidpn[_n+1]
*/


************************** NOT WORKING NEXT PERIOD *****************************
/* Goal: in wave t, for respondents who are working, record whether they have
stopped working in wave t+1. Uses Fwork (next wave's work flag) created in the
previous section. */

* Defined only when work is 1 now and next wave's flag is 0 or 1:
* 0 if still working in t+1, 1 if no longer working in t+1.
gen notwork_trans_next = 1 - Fwork if work == 1 & (Fwork == 0 | Fwork == 1)

/*  ROSS' WAY: CAN DELETE AFTER MARGHE HAS CHECKED 
gen notwork_trans =.
replace notwork_trans = 0 if work==1 & Lwork==1
replace notwork_trans = 1 if work==0 & Lwork==1
gen notwork_trans_next_ross = notwork_trans[_n+1] if rahhidpn==rahhidpn[_n+1]
*/

*********************************************************************************
*********************************************************************************
*************************** GENERATE VARIABLES **********************************
*********************************************************************************
*********************************************************************************

/* These come from GetHealthMeasurementPanel.do */

*** Generate age variables to various powers
gen age1=agey_e
gen age2=agey_e^2
gen age3=agey_e^3
gen age4=agey_e^4

*** Numeric copy of the person identifier
* rahhidpn is already numeric at this point (it was destrung before tsset).
* FIX: generate without a storage type creates a float, which carries only
* about 7 significant digits, so 9-digit identifiers were rounded and distinct
* respondents could end up with the same rahhidpn_num. Storing the copy as
* double keeps every identifier exact. The former destring call was a no-op on
* an already-numeric variable and has been removed.
gen double rahhidpn_num = rahhidpn


*** Restrict the binary deficit variables to the values 0 and 1.
*** Anything else - in particular the value 2 (don't do) and extended missing
*** codes - becomes system missing.

local deficit_items eata dressa beda toilta batha walkra walksa phonea moneya ///
	shopa mealsa chaira stoopa lifta mapa medsa clim1a dimea armsa pusha hibpe diabe ///
	cancre lunge hearte stroke psyche arthre bmigte30 smokev hosp nrshom ///
	walk1a sita climsa

* First pass: tabulate every item, including missing codes, before cleaning.
foreach item of local deficit_items {
	tab `item', m
}

* Second pass: blank out every value that is not exactly 0 or 1.
foreach item of local deficit_items {
	replace `item' = . if !inlist(`item', 0, 1)
}


*** Race-by-ethnicity and race-by-gender indicators.
* Each indicator is 0 as soon as any one component is known to rule the group
* out, 1 when every component matches, and missing otherwise. The two
* conditions are mutually exclusive, so the order of assignment is immaterial.

gen BlackNonHisp = 0 if black==0 | hispanic==1
replace BlackNonHisp = 1 if black==1 & hispanic==0

gen WhiteNonHisp = 0 if white==0 | hispanic==1
replace WhiteNonHisp = 1 if white==1 & hispanic==0

gen WhiteMan = 0 if male==0 | white==0 | hispanic==1
replace WhiteMan = 1 if male==1 & white==1 & hispanic==0

gen WhiteWoman = 0 if male==1 | white==0 | hispanic==1
replace WhiteWoman = 1 if male==0 & white==1 & hispanic==0

gen BlackMan = 0 if male==0 | black==0 | hispanic==1
replace BlackMan = 1 if male==1 & black==1 & hispanic==0

gen BlackWoman = 0 if male==1 | black==0 | hispanic==1
replace BlackWoman = 1 if male==0 & black==1 & hispanic==0

* Hispanic groups are defined regardless of race.
gen HispanicMan = 0 if male==0 | hispanic==0
replace HispanicMan = 1 if male==1 & hispanic==1

gen HispanicWoman = 0 if male==1 | hispanic==0
replace HispanicWoman = 1 if male==0 & hispanic==1

*** Health-state outcomes for a multinomial logit
/* The original author was unsure where these are used. */

* hs: 0 = self-rated health 1-3, 1 = self-rated health 4-5,
*     2 = nrshom equal to 1, 3 = dead. Later rules override earlier ones.
gen hs = inlist(shlt, 4, 5) if inlist(shlt, 1, 2, 3, 4, 5)
replace hs = 2 if nrshom==1
replace hs = 3 if Dead==1

* hs_exp: expanded version - self-rated health 1..5 mapped to 0..4,
*         then 5 = nrshom equal to 1, 6 = dead.
gen hs_exp = shlt - 1 if inlist(shlt, 1, 2, 3, 4, 5)
replace hs_exp = 5 if nrshom==1
replace hs_exp = 6 if Dead==1

* Leads: value in the next row, kept only when that row is the same person.
* Unlike F., this does not check that the next row is the adjacent wave.
* NOTE(review): assumes the data are still sorted by rahhidpn wave (as left by
* the earlier tsset) - confirm nothing re-sorts in between.
foreach v in hs hs_exp shlt {
	gen next_`v' = `v'[_n+1] if rahhidpn==rahhidpn[_n+1]
}

foreach k in 32 34 35 {
	gen next_frailty`k' = frailty`k'[_n+1] if rahhidpn==rahhidpn[_n+1]
}

foreach k in 25 23 22 {
	gen next_frailty_self`k' = frailty_self`k'[_n+1] if rahhidpn==rahhidpn[_n+1]
}

foreach k in 10 9 8 {
	gen next_frailty_diag`k' = frailty_diag`k'[_n+1] if rahhidpn==rahhidpn[_n+1]
}


*** Generate new variables related to health status
* Each variable compares one self-rated health level against excellent:
*   1 = the named level, 0 = excellent (shlt==1), 2 = any other level.
* FIX: the third rule used to read shlt==(3|4|5). In Stata, | is the logical
* OR operator, so (3|4|5) evaluates to 1 and the rule became shlt==1: it
* overwrote the 0 category with 2 and never touched the intended levels.
* inlist() expresses the intended set membership.

gen vgood=1 if shlt==2
replace vgood=0 if shlt==1
replace vgood=2 if inlist(shlt,3,4,5)


gen good=1 if shlt==3
replace good=0 if shlt==1
replace good=2 if inlist(shlt,2,4,5)

gen fair=1 if shlt==4
replace fair=0 if shlt==1
replace fair=2 if inlist(shlt,2,3,5)

gen poor=1 if shlt==5
replace poor=0 if shlt==1
replace poor=2 if inlist(shlt,2,3,4)

*** Three-category race/ethnicity variable: White non-Hispanic, Black
*** non-Hispanic, Hispanic. Everyone else is left missing.
* Precedence matches sequential assignment: Hispanic first, then Black
* non-Hispanic, then White non-Hispanic.
gen race = cond(hispanic==1, 3, cond(BlackNonHisp==1, 2, cond(WhiteNonHisp==1, 1, .)))

label define racel 1 "White" 2 "Black" 3 "Hispanic" 
label values race racel

********************************************************************************
************************** ADJUST OUTCOME VARIABLES ****************************
********************************************************************************

*** Create Full Retirement Age variable (in months) based on SS rules
*** https://www.ssa.gov/pressoffice/IncRetAge.html
* Schedule encoded below: 65 years for birth years up to 1937, rising by two
* months per birth year over 1938-1942, 66 years for 1943-1954, rising by two
* months per birth year over 1955-1959, and 67 years from 1960 onward.
gen fra_m = .
replace fra_m = 65*12 if rabyear<=1937
forvalues step = 1/5 {
	replace fra_m = 65*12 + 2*`step' if rabyear == 1937 + `step'
}
replace fra_m = 66*12 if rabyear>=1943 & rabyear<=1954
forvalues step = 1/5 {
	replace fra_m = 66*12 + 2*`step' if rabyear == 1954 + `step'
}
* FIX: Stata treats missing as larger than any number, so rabyear>=1960 alone
* assigned a 67-year FRA to respondents with unknown birth year. They now keep
* a missing FRA.
replace fra_m = 67*12 if rabyear>=1960 & !missing(rabyear)

*** SS retirement claiming outcome: keep only ages 60 through 75
*** (a missing age also falls outside the range and is blanked)
replace isret_trans_next = . if !inrange(agey_e, 60, 75)

*** Disability-only outcome: drop observations past the full retirement age
*br rahhidpn rabyear year agey_e ragem_e fra_m 
/* Question answered by fsdi: will the respondent's next-wave age in months
(ragem_e) have reached the full retirement age fra_m? If so, the SDI
transition is not modelled for that observation. */
tsset rahhidpn wave
* fsdi is 1 when next wave's age in months is at least fra_m, else 0.
* A missing next-wave age also compares as >= and so yields 1.
gen fsdi = (F.ragem_e >= fra_m)
*br rahhidpn rabyear year agey_e ragem_e fra_m fsdi isdi ssdi_dionly_trans_next
replace ssdi_dionly_trans_next = . if fsdi == 1
/* The original author notes this could be done in a single line,
replace ssdi_dionly_trans_next=. if (ragem_e+24)>=fra_m
but the flag was kept so the data could be browsed first. */

*** Create outcome variable for rassageb
* NOTE(review): rassageb is presumably the age at which the respondent started
* receiving Social Security benefits - confirm against the RAND codebook.
sort rahhidpn wave
by rahhidpn: gen Fagey_e = F.agey_e
*br rahhidpn year agey_e rassageb isret isdi
* startss is defined only while the current age is below rassageb:
*   0 if next wave's age is still below it, 1 if next wave's age reaches it.
gen startss = . 
replace startss = 0 if Fagey_e<rassageb & agey_e<rassageb
* FIX: a missing next-wave age compares as larger than any number, so
* observations with no next-wave age (for example the last wave) were flagged
* as starting benefits. Require an observed next-wave age.
replace startss = 1 if Fagey_e>=rassageb & agey_e<rassageb & !missing(Fagey_e)
*br rahhidpn year agey_e rassageb isret isdi startss
* Not defined past age 75 (a missing current age is blanked as well).
replace startss = . if agey_e>75

********************************************************************************
************************** ADJUST MSTAT VARIABLE *******************************
********************************************************************************

tabulate mstat

*** Collapse marital status into three groups:
*** mstat 1-2 -> 1, mstat 3 -> 2, mstat 4-8 -> 3; anything else stays missing
gen marstat = .
replace marstat = 1 if inlist(mstat, 1, 2)
replace marstat = 2 if mstat == 3
replace marstat = 3 if inlist(mstat, 4, 5, 6, 7, 8)

label define marstatl 1 "Married" 2 "Partnered" 3 "Single"
label values marstat marstatl
tabulate marstat

*********************************************************************************
*********************************************************************************
***************************** SAVE BALANCED PANEL  *****************************
*********************************************************************************
*********************************************************************************

* Write the balanced panel as a Stata dataset and as a CSV, then compress the
* CSV with the system gzip (requires gzip on the shell path).
save "$dtafiles/CleanPanelBalanced.dta", replace
export delimited using "$dtafiles/CleanPanelBalanced.csv", replace
* FIX: -f makes gzip overwrite an existing CleanPanelBalanced.csv.gz. Without
* it, rerunning this file leaves the old archive in place (gzip refuses to
* overwrite), which is inconsistent with the replace options above.
!gzip -f "$dtafiles/CleanPanelBalanced.csv"

*********************************************************************************
*********************************************************************************
**************************** SAVE UNBALANCED PANEL  *****************************
*********************************************************************************
*********************************************************************************


/*Need to be careful about timing of death: some people die in the same 
year/wave as their last Core interview. According to our definition of Dead, 
they are recorded as alive in the last Core interview and as Dead in the 
following one. We need to adjust the exit variables (roopmd...) using the
"number of months between last core interview and death", which is radtimtdth, 
to rescale them to 2-year intervals (so that everything is in 2-year intervals). 
Age has already been adjusted.
*/

* Flag rows to keep in the unbalanced panel: rows where InWave is 1, plus the
* first row in which the respondent is recorded as dead (Dead switches from
* 0 in the previous row to 1 in this row).
gen inw_or_justdead=InWave
sort rahhidpn wave
by rahhidpn: replace inw_or_justdead=1 if Dead==1 & Dead[_n-1]==0

**** Keep non-missing values + one obs for death info
drop if inw_or_justdead!=1

*** Keep only non missing values 
* NOTE(review): this second drop keeps only rows with InWave equal to 1, so
* any just-dead row retained by the previous drop (where InWave is not 1) is
* removed again. Confirm whether the death row is meant to survive.
drop if InWave~=1

*** Save
save "$dtafiles/CleanPanel.dta", replace

*** Erase outputs that are not necessary for running the .R code
* NOTE(review): CleanPanel.dta is erased immediately after being saved above,
* so the unbalanced panel does not persist on disk - confirm this is intended.
* erase stops with an error if a file does not exist.
erase "$dtafiles/CleanPanel.dta"
erase "$dtafiles/Panel_before_frailty.dta"
erase "$dtafiles/panel_wide.dta"
erase "$dtafiles/HRSTracker.dta"

*** Create self_prob_dying.dta (per the original comment; done by prob_death)
do "$dofiles/prob_death"



