#THIS CODE

rm(list=ls())

library(data.table)
library(ggplot2)
library(Hmisc)
library(readstata13)
library(plm)
library(plyr)
library(psych)
library(R.matlab)
library(statar)
library(tidyr)
library(splitstackshape)

#load("psid2.RData")
# This code performs several computations and transformations from a previously cleaned
# PSID sample, stored in one data.table called dam where the personal identifier (of
# head of household) is personid

folder <- "Outputs/"

dam <- subset(dam,select=c("year","Age","AssetFarmIncome","AssetBusinessIncome",
                           "HeadRentInterest","WifeAssetIncome","AssetGardening",
                           "AssetRoomers","HeadRent","HeadInterest","TotalFamilyIncome",
                           "HWTaxableIncome","LaborEarningsHead","LaborEarningsWife1",
                           "NumberPeople","AssetIncomeOthersFU","FederalIncomeTax",
                           "IncomeOthers","personid","Sex"))

## CONSTRUCTION OF POST-TAX HOUSEHOLD LABOR INCOME ----
# This is described in Appendix A.3, and broadly follows
# Guvenen and Smith (ecta, 2014)

# First build asset income

dam$assetincome <- 0
p1 <- dam$year>1975 & dam$year<1979

dam$assetincome[p1] <- dam$AssetFarmIncome[p1] + dam$AssetBusinessIncome[p1] +
  dam$HeadRentInterest[p1] + dam$WifeAssetIncome[p1] 

p2 <- dam$year>1978 & dam$year<1984

dam$assetincome[p2] <- dam$AssetFarmIncome[p2] + dam$AssetBusinessIncome[p2] +
  dam$HeadRentInterest[p2] + dam$WifeAssetIncome[p2]  +
  dam$AssetGardening[p2] + dam$AssetRoomers[p2]
  
p3 <- dam$year>1983 & dam$year<1993

dam$assetincome[p3] <- dam$AssetFarmIncome[p3] + dam$AssetBusinessIncome[p3] +
  dam$HeadRent[p3] + dam$HeadInterest[p3] + dam$WifeAssetIncome[p3] +
  dam$AssetGardening[p3] + dam$AssetRoomers[p3]

# Now build nonfinancial income by appropriately subtracting asset income
# whenever available

dam$nonfin <- 0

p1 <- dam$year<1976

dam$nonfin[p1] <- dam$TotalFamilyIncome[p1] -
  dam$HWTaxableIncome[p1] + dam$LaborEarningsHead[p1] +
  dam$LaborEarningsWife1[p1]

p2 <- dam$year>1975 & dam$year<1984

dam$nonfin[p2] <- dam$TotalFamilyIncome[p2] - dam$assetincome[p2]

p3 <- dam$year>1983 & dam$year<1993

dam$nonfin[p3] <- dam$TotalFamilyIncome[p3] - dam$assetincome[p3] -
  dam$AssetIncomeOthersFU[p3]

dam$LaborEarningsWife <- NA
dam$LaborEarningsWife[dam$year<1993] <- dam$LaborEarningsWife1[dam$year<1993]
dam <- subset(dam,nonfin>0)

# TAXATION

fica.data <- read.csv(paste0(folder.from,"fica_data.csv"))
fica.data <- rename(fica.data,c("tax"="sstax","cap"="sscap"))
fica.data$year <- fica.data$year+1 #PSID income measures refer to previous year
dam <- merge(dam,fica.data,by="year",all.x=T)
dam$sstaxliab <- dam$sstax*pmin(dam$LaborEarningsHead,dam$sscap)+dam$sstax*pmin(dam$LaborEarningsWife1,dam$sscap)
dam$IncomeTax <- dam$FederalIncomeTax + dam$sstaxliab

# Info in fica_data is not deflated.
# So need to deflate here

# Load CPIU
cpi <- as.data.table(read.csv(paste0(folder.from,"cpiu.csv"),stringsAsFactors=FALSE))
cpi <- rename(cpi,c("DATE"="year","VALUE"="cpi"))
cpi$cpi <- cpi$cpi/(cpi$cpi[68]) #year 2013
cpi$year <- cpi$year+1 # PSID income variables refer to previous year

setkeyv(dam,"year")
setkeyv(cpi,"year")
dam <- merge(dam,cpi,all.x=TRUE)

dam$LaborEarningsHead <- dam$LaborEarningsHead/dam$cpi
dam$TotalFamilyIncome <- dam$TotalFamilyIncome/dam$cpi
dam$LaborEarningsWife1 <- dam$LaborEarningsWife1/dam$cpi
dam$IncomeOthers <- dam$IncomeOthers/dam$cpi
dam$HWTaxableIncome <- dam$HWTaxableIncome/dam$cpi
dam$FederalIncomeTax <- dam$FederalIncomeTax/dam$cpi
dam$assetincome <- dam$assetincome/dam$cpi    
dam$nonfin <- dam$nonfin/dam$cpi 

dam$labtotal <- dam$LaborEarningsHead+dam$LaborEarningsWife1
dam$labsq <- dam$labtotal^2
dam$nonfinsq <- dam$nonfin^2
dam$assetsq <- dam$assetincome^2

taxregre <- lm(IncomeTax ~ nonfin + nonfinsq + assetincome + assetsq,data=subset(dam,year<1992)) 
# change in FICA system + availability of data: we estimate this in the sample up to 1990 
# and keep the same functional form for the post-1990 years

# Generate nonfinancial disposable earnings:
dam$ry <- dam$nonfin - taxregre$coefficients[2]*dam$nonfin - 
  taxregre$coefficients[3]*(dam$nonfinsq)
dam[,ny := shift(ry,type="lead"),by="personid"]

# And other measures:
dam$pretaxincome <- dam$HWTaxableIncome+dam$IncomeOthers
dam$disposable <- dam$TotalFamilyIncome - dam$IncomeTax

dam.du <- subset(dam,select=c("personid","Age","year","pretaxincome",
                              "ry","labtotal","LaborEarningsHead","NumberPeople",
                              "disposable","Sex"))

#save(dam.du,file=paste0(folder,"psid_income_defs.RData"))


# Lower limit on earnings

dam <- subset(dam,ry>1518)
dam <- subset(dam,ny>1518)

dam$logy <- log(dam$ry)
dam$loglabtotal <- log(dam$labtotal)
dam$byear <- dam$year-dam$Age

# Control for year fixed effects + equivalize
regre <- lm(logy ~ factor(year) + factor(NumberPeople),data=dam)
dam$resy <- residuals(regre)


## Adjust sample to apply ABB method

counts <- dam[,list(nobs=length(year)),by=personid]
setkeyv(counts,"personid")
setkeyv(dam,"personid")

dam <- merge(dam,counts)

# NOW pick the first...

dam$uno <- 1
cumsu <- dam[,list(Age,cumsum(uno)),by=personid]

setkeyv(dam,c("personid","Age"))
setkeyv(cumsu,c("personid","Age"))

dam <- merge(dam,cumsu)

i <- 1
Troll <- 3  # number for rolling window (to be chosen)
abb <- data.table()
groups.remaining <- 1
while (groups.remaining>0) {
  abtemp <- subset(dam,nobs>=i*Troll)
  abtemp <- subset(abtemp,V2<i*Troll+1)
  abtemp <- subset(abtemp,V2>(i-1)*Troll)
  abb <- rbind(abb,abtemp)
  i <- i+1
  if (nrow(abtemp)==0) {
    groups.remaining <- 0
  }
}

abb.export <- subset(abb,select=c("resy","Age"))

write.table(abb.export,paste0(folder,"psid_forabb.txt"), sep="\t",row.names=F,col.names=F)


# COMPUTATION OF VARIANCES FOLLOWING KAPLAN (2012) PROCEDURE
# + saving data for bootstrap

damm <- dam

regre.stypre <- lm(logy ~ factor(NumberPeople),data=damm)
damm$logsty <- residuals(regre.stypre)

# for regression via equivlz
regre.ka <- lm(logsty ~ factor(Age) + factor(year),data=damm)
damm$regka <- residuals(regre.ka)
damm[,nregka:=shift(regka,type="lead"),by="personid"]
damm[,n2regka:=shift(regka,type="lead",n=2),by="personid"]
damm$sqka <- damm$regka^2
damm$covka <- damm$regka*damm$nregka
damm$cov2ka <- damm$regka*damm$n2regka

damm <- subset(damm,is.na(sqka)==0)
damm <- subset(damm,is.na(sqka)==0 & is.na(covka)==0) ##
dammcov <- subset(damm,is.na(covka)==0)
damm2cov <- subset(damm,is.na(cov2ka)==0)
dammcov2 <- subset(damm,is.na(covka)==0 & is.na(sqka)==0)


numage <- damm[,list(numage=length(personid)),by="Age"]
numageyear <- damm[,list(numageyear=length(personid)),by=c("year","Age")]
damm <- merge(damm,numage,by="Age")
damm <- merge(damm,numageyear,by=c("Age","year"))
damm$numyear <- (max(damm$year)-min(damm$year)+1)
damm$check <- damm$numage/(damm$numyear*damm$numageyear)
check <- damm[,mean(check),by="Age"]
damm$check[damm$check>3.5] <- 3.5 # Limit weights to 3 (above pctile 99.99%)
damm$kasum <- damm$sqka*damm$check
ka.age <- damm[,mean(kasum),by="Age"]
ka.age.more <- damm[,mean(kasum),by=c("Age","year")]

# and covariance

numagec <- dammcov[,list(numage=length(personid)),by="Age"]
numageyearc <- dammcov[,list(numageyear=length(personid)),by=c("year","Age")]
dammcov <- merge(dammcov,numagec,by="Age")
dammcov <- merge(dammcov,numageyearc,by=c("Age","year"))
dammcov$numyear <- (max(dammcov$year)-min(dammcov$year)+1)
dammcov$check <- dammcov$numage/(dammcov$numyear*dammcov$numageyear)
check <- dammcov[,mean(check),by="Age"]
damm$check[damm$check>3.5] <- 3.5
dammcov$kasum <- dammcov$covka*dammcov$check
ka.age.cov <- dammcov[,mean(kasum),by="Age"]
ka.age.cov.more <- dammcov[,mean(kasum),by=c("Age","year")]

# dammcov2 to bootstrap
# same counts for cov and var

numage <- dammcov2[,list(numage=length(personid)),by="Age"]
numageyear <- dammcov2[,list(numageyear=length(personid)),by=c("year","Age")]
dammcov2 <- merge(dammcov2,numage,by="Age")
dammcov2 <- merge(dammcov2,numageyear,by=c("Age","year"))
dammcov2$numyear <- (max(dammcov2$year)-min(dammcov2$year)+1)
dammcov2$check <- dammcov2$numage/(dammcov2$numyear*dammcov2$numageyear)
check <- dammcov2[,mean(check),by="Age"]
dammcov2$check[dammcov2$check>3.5] <- 3.5
dammcov2$kasum <-dammcov2$sqka*dammcov2$check
dammcov2$ckasum <- dammcov2$covka*dammcov2$check
# ka.age <- dammcov2[,mean(kasum),by="Age"]
#ka.age.cov <- dammcov2[,mean(ckasum),by="Age"]

write.csv(ka.age,paste0(folder,"earnvarprofile_age_kaplan.csv"))
write.csv(ka.age.cov,paste0(folder,"earncovarprofile_age_kaplan.csv"))

boot.exp <- subset(dammcov2,select=c("Age","kasum","ckasum"))
boot.exp <- subset(boot.exp, Age>24 & Age<61)
boot.exp <- subset(boot.exp,is.na(kasum)==0 & is.na(ckasum)==0)
write.csv(boot.exp,paste0(folder,"psidsample_boot.csv"))   

# Compute Figure 1 and Figure C.1.

source('functions/incomedefs.R')

# Compute variances with cohort effects for Figure B.1.


regre.ka.coh <- lm(logsty ~ factor(byear) + factor(year),data=damm)
damm$regkacoh <- residuals(regre.ka.coh)
damm[,nregkacoh:=shift(regkacoh,type="lead"),by="personid"]
damm[,n2regkacoh:=shift(regkacoh,type="lead",n=2),by="personid"]
damm$sqkacoh <- damm$regkacoh^2
damm$covkacoh <- damm$regkacoh*damm$nregkacoh
damm$cov2kacoh <- damm$regkacoh*damm$n2regkacoh
damm$numage <- NULL
damm$numageyear <- NULL

damm <- subset(damm,is.na(sqkacoh)==0)
dammcov <- subset(damm,is.na(covkacoh)==0)
damm2cov <- subset(damm,is.na(cov2kacoh)==0)

numage <- damm[,list(numage=length(personid)),by="Age"]
numageyear <- damm[,list(numageyear=length(personid)),by=c("year","Age")]
damm <- merge(damm,numage,by="Age")
damm <- merge(damm,numageyear,by=c("Age","year"))
damm$numyear <- (max(damm$year)-min(damm$year)+1)

damm$check <- damm$numage/(damm$numyear*damm$numageyear)
check <- damm[,mean(check),by="Age"]

damm$check[damm$check>3.5] <- 3.5

damm$kasumcoh <- damm$sqkacoh*damm$check
ka.coh <- damm[,mean(kasumcoh),by="Age"]


ka.compare <- merge(ka.age,ka.coh,by="Age")
ka.compare <- melt(ka.compare,id="Age")
ka.compare <- subset(ka.compare,Age>24 & Age<61)

source('functions/gon_graphs_export.R')

# Figure B.1.

y <- grafico.varias.lineas.bn(ka.compare,"Age","value","variable","Age","Variance",
                           "f_cohyear_variances.pdf")