# Start from an empty workspace: removes every object in the environment the
# script is run in.
# NOTE(review): this also wipes the session of anyone who source()s the file
# interactively -- consider dropping it and running the script in a fresh R
# process instead.
rm(list=ls())

# NOTE(review): computer.code is not referenced anywhere else in the visible
# script -- presumably a leftover machine/path selector; confirm before removing.
computer.code <- 1

library(data.table)
library(readstata13)
library(plyr)
library(R.matlab)
library(statar)
library(weights)

## --- Load the CEX extract from Heathcote, Perri and Violante as a data.table.

df <- as.data.table(read.dta13('cex_b.dta'))

## --- Sample selection (optional restrictions, currently switched off)

#df2 <- subset(df,intyea<91)
#df2 <- subset(df,rururb==1)

# Work on an explicit copy so the by-reference updates below leave df untouched.
df2 <- copy(df)

# Household size: sum of the five member-count columns.
df2[, npers := nmo15 + nwo15 + nmu15 + nwu15 + nbab]

# Age of the reference person re-expressed at interview year 80.
df2[, age80 := refage - intyea + 80]

# Five-year bins of age80 (coho) and bins of current age (ageg).
# cut() builds right-closed intervals, so ageg covers ages 25 to 60.
df2[, coho := cut(age80, breaks = seq(from = 11, to = 86, by = 5))]
df2[, ageg := cut(refage, breaks = c(24, 29, 34, 39, 44, 50, 55, 60))]

#df2 <- subset(df2,age80<81)

# Combined year-quarter identifier: 100 * year + quarter.
df2[, yqua := 100 * intyea + intqua]

# Consumption measure and sample trimming.
# NOTE(review): the original note here said "Focus on NDPND2 - adapted measure
# of deflated consumption, corrected for some problems in the CEX", but the
# code below uses the column nondurnd -- confirm which measure is intended.

# Follow their trimmings: keep households with strictly positive earnings and
# tia (rows where either is missing are dropped by subset() as well).
df2 <- subset(df2, earnings > 0 & tia > 0)

# Bottom 0.5% cut-offs of log per-adult-equivalent earnings and tia. Both are
# computed on the same sample, before either cut is applied. Each threshold is
# named after the variable it is computed from.
cut.earn <- quantile(log(df2$earnings / df2$aduoecd), probs = 0.005)
cut.tia <- quantile(log(df2$tia / df2$aduoecd), probs = 0.005)

df2 <- subset(df2, log(earnings / aduoecd) >= cut.earn)
df2 <- subset(df2, log(tia / aduoecd) >= cut.tia)

# Log consumption, raw and per adult equivalent.
df2$logcons <- log(df2$nondurnd)
df2$logequi <- log(df2$nondurnd / df2$aduoecd)

# Keep only usable log consumption. is.finite() drops NA/NaN and also the -Inf
# produced by zero consumption, which lm() would otherwise reject.
df2 <- subset(df2, is.finite(logcons))

# Equivalize logcons using a regression: household-size dummies without an
# intercept; the residual is the equivalized log consumption.

regre.equiv <- lm(logcons ~ 0 + factor(npers), data = df2)
df2[, logequir := residuals(regre.equiv)]

# Variances of residuals by age, after removing either interview-year effects
# or age80 (cohort) effects.

my.yea.equir <- lm(logequir ~ factor(intyea), data = df2)
my.coh.equir <- lm(logequir ~ factor(age80), data = df2)

df2[, `:=`(
  res.yea.equir = residuals(my.yea.equir),
  res.coh.equir = residuals(my.coh.equir)
)]

# One row per age group; households outside the age bins are left out.
equir <- df2[
  !is.na(ageg),
  list(mycoh = var(res.coh.equir), myyea = var(res.yea.equir)),
  by = "ageg"
]
setkeyv(equir, "ageg")

# Keep the variance level under year controls, then express both profiles
# relative to the youngest age group (first row once keyed by ageg).
equir[, myyeatot := myyea]
equir[, mycoh := mycoh - mycoh[1]]
equir[, myyea := myyea - myyea[1]]

# Now the Kaplan 2012 procedure, for equir
#
# 1. Remove age-group and interview-year effects from equivalized log
#    consumption.
# 2. Weight each squared residual by numage / (numyear * numageyear).
# 3. Average the weighted squared residuals within each age group.

# na.exclude makes residuals() return one value per row of df2 (NA for rows
# lm() leaves out because ageg or intyea is missing), so the assignment below
# cannot fail on a length mismatch.
regre.kar <- lm(logequir ~ factor(ageg) + factor(intyea), data = df2,
                na.action = na.exclude)
df2$regkar <- residuals(regre.kar)
df2$sqkar <- df2$regkar^2

# Keep only the rows that entered the regression.
damm <- subset(df2, !is.na(sqkar))

# Cell counts: households per age group and per age group x year.
numage <- damm[, list(numage = .N), by = "ageg"]
numageyear <- damm[, list(numageyear = .N), by = c("intyea", "ageg")]

damm <- merge(damm, numage, by = "ageg")
damm <- merge(damm, numageyear, by = c("ageg", "intyea"))

# Span of interview years.
# NOTE(review): this counts calendar years between the first and last one, so
# it assumes no year is missing from the sample -- confirm.
damm$numyear <- max(damm$intyea) - min(damm$intyea) + 1

# Diagnostic: the mean weight within an age group.
# NOTE(review): presumably expected to be 1 when every age group is observed
# in every year -- confirm.
damm$check <- damm$numage / (damm$numyear * damm$numageyear)

check <- damm[, mean(check), keyby = "ageg"]

damm$kasum <- damm$sqkar * damm$numage / (damm$numyear * damm$numageyear)

# keyby returns the age groups in factor-level order.
ka.ager <- damm[, list(kaage = mean(kasum)), keyby = "ageg"]

# Age label attached to each ageg bin, looked up through the factor level code
# rather than by row position, so a missing or reordered group cannot shift
# the labels.
age.mid <- seq(27.5, 57.5, 5)
stopifnot(length(age.mid) == nlevels(damm$ageg))
ka.ager[, age := age.mid[as.integer(ageg)]]
ka.ager[, ageg := NULL]

folder <- "Outputs/"

# Create the output folder on first run; does nothing if it already exists.
dir.create(folder, showWarnings = FALSE, recursive = TRUE)

write.csv(ka.ager, file = paste0(folder, "data_varlogcons_ka_reg.csv"),
          row.names = FALSE)