# This code will generate the following variables:

to_keep <- ls()
start_time <- Sys.time()
setwd(main_path)

# Extract the last two digits to identify HH

# Get Panel data
tracker<- read_dta("rawdata/trk2022tr_r.dta") 

names(tracker) <- tolower(names(tracker))
tracker_birth_year<- tracker %>% mutate(rahhidpn = as.numeric(paste0(hhid, pn))) %>% 
  select(hhid,pn,rahhidpn,birthyr) %>% mutate(birthyr=ifelse(birthyr==0,NA,birthyr))

df <- read_dta("dtafiles/CleanPanelBalanced.dta") %>%  mutate(shlt = ifelse(Dead==1, 6, shlt), frailty_bl = frailty32) %>% arrange(rahhidpn,year)

df <- left_join(df,tracker_birth_year, by = "rahhidpn") %>% mutate(birthyear=birthyr) %>% 
  mutate(agey_e = ifelse(!is.na(agey_e), agey_e, year-birthyear),
         agey_e2= year-birthyear)

file_name<- c('dtafiles/P52_5_Clusters.dta')
file_name_data_clustering<- c('dtafiles/data_clustering.dta')



df$pnumber <- as.numeric(substr(df$rahhidpn, nchar(df$rahhidpn) - 1, nchar(df$rahhidpn)))
df$hhid <- as.numeric(df$rahhidpn-df$pnumber)

## This code will generate the following variables
#age 
#age^2
#age^3
#gender
#marital_status
#marital_status*age^2
#SingleWoman
#SingleWoman*age
#Health_Status
#Health_Status*age
#PI
#PI*age
#PI^2
#PI^2*age
#PI*marital
#PI^2*marital

#######################


# Creating age polinomials
df$age_1 <- as.numeric(df$agey_e)
df$age_2 <- as.numeric(df$age_1)*as.numeric(df$age_1)
df$age_3<-  as.numeric(df$age_2)*as.numeric(df$age_1)

# Creating gender dummy
# 1 for Women
df$genderd <- as.numeric(ifelse(df$ragender==2,1,0))

# Interaction with age
df$genderd_age_1 <- as.numeric(ifelse(!is.na(df$genderd),df$genderd*df$age_1,NA))

# Interaction with age2
df$genderd_age_2 <- as.numeric(ifelse(!is.na(df$genderd),df$genderd*df$age_2,NA))

# Interaction with age3
df$genderd_age_3 <- as.numeric(ifelse(!is.na(df$genderd),df$genderd*df$age_3,NA))


# Dummy for marital status
# 1 for coupled, 0
df$mstatusd <- as.numeric(ifelse(df$hcpl==1,1,
                                 ifelse(df$hcpl==0,0,NA)))

# Interaction with age
df$mstatusd_age_1 <- as.numeric(ifelse(!is.na(df$mstatusd),df$mstatusd*df$age_1,NA))

# Interaction with age2
df$mstatusd_age_2 <- as.numeric(ifelse(!is.na(df$mstatusd),df$mstatusd*df$age_2,NA))

# Interaction with age3
df$mstatusd_age_3 <- as.numeric(ifelse(!is.na(df$mstatusd),df$mstatusd*df$age_3,NA))



#Single woman ==1
df$SingleWoman <- as.numeric(ifelse(df$mstatusd==0 & df$genderd==1,1,
                                    ifelse(df$mstatusd==0 & df$genderd==0,0,
                                           ifelse(df$mstatusd==1,0,NA))))

#Single woman*age
df$SingleWoman_age_1 <- as.numeric(ifelse(!is.na(df$SingleWoman),df$SingleWoman*df$age_1,NA))

# Convert the 'category' variable to a factor
df$srhs <- as.factor(df$shlt)
#Excellent Health
df$srhs_1 <- as.numeric(ifelse(df$srhs==1 & !is.na(df$srhs) ,1,
                               ifelse(df$srhs!=1 & !is.na(df$srhs),0,NA)))
#Very Good Health
df$srhs_2 <- as.numeric(ifelse(df$srhs==2 & !is.na(df$srhs) ,1,
                               ifelse(df$srhs!=2 & !is.na(df$srhs),0,NA)))
#Good Health

df$srhs_3 <- as.numeric(ifelse(df$srhs==3 & !is.na(df$srhs) ,1,
                               ifelse(df$srhs!=3 & !is.na(df$srhs),0,NA)))
#Fair Health

df$srhs_4 <- as.numeric(ifelse(df$srhs==4 & !is.na(df$srhs) ,1,
                               ifelse(df$srhs!=4 & !is.na(df$srhs),0,NA)))
#Poor Health
df$srhs_5 <- as.numeric(ifelse(df$srhs==5 & !is.na(df$srhs) ,1,
                               ifelse(df$srhs!=5 & !is.na(df$srhs),0,NA)))
#Death
df$srhs_6 <- as.numeric(ifelse(df$srhs==6 & !is.na(df$srhs) ,1,
                               ifelse(df$srhs!=6 & !is.na(df$srhs),0,NA)))

## Health Status interacted with Age
df$srhs_1_age_1<-as.numeric(ifelse(!is.na(df$srhs),df$srhs_1*df$age_1,NA))
df$srhs_2_age_1<-as.numeric(ifelse(!is.na(df$srhs),df$srhs_2*df$age_1,NA))
df$srhs_3_age_1<-as.numeric(ifelse(!is.na(df$srhs),df$srhs_3*df$age_1,NA))
df$srhs_4_age_1<-as.numeric(ifelse(!is.na(df$srhs),df$srhs_4*df$age_1,NA))
df$srhs_5_age_1<-as.numeric(ifelse(!is.na(df$srhs),df$srhs_5*df$age_1,NA))
df$srhs_6_age_1<-as.numeric(ifelse(!is.na(df$srhs),df$srhs_6*df$age_1,NA))


## Educational Attainment
# Base on raedyrs
df<- df %>% mutate(educ_cat = ifelse(is.na(raedyrs),NA,ifelse(raedyrs<12,1,
                                     ifelse(raedyrs==12,2,3))))

#Educational Categories dummies
df$educ_cat_1 <- as.numeric(ifelse(df$educ_cat==1 & !is.na(df$educ_cat) ,1,
                                   ifelse(df$educ_cat!=1 & !is.na(df$educ_cat),0,NA)))

df$educ_cat_2 <- as.numeric(ifelse(df$educ_cat==2 & !is.na(df$educ_cat) ,1,
                                   ifelse(df$educ_cat!=2 & !is.na(df$educ_cat),0,NA)))

df$educ_cat_3 <- as.numeric(ifelse(df$educ_cat==3 & !is.na(df$educ_cat) ,1,
                                   ifelse(df$educ_cat!=3 & !is.na(df$educ_cat),0,NA)))


#Educational Categories  interacted with age
df$educ_cat_1_age_1 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_1*df$age_1,NA))

df$educ_cat_2_age_1 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_2*df$age_1,NA))

df$educ_cat_3_age_1 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_3*df$age_1,NA))


# Educational Categories  interacted with age^2
df$educ_cat_1_age_2 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_1_age_1*df$age_1,NA))

df$educ_cat_2_age_2 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_2_age_1*df$age_1,NA))

df$educ_cat_3_age_2 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_3_age_1*df$age_1,NA))

# Educational Categories  interacted with age^3
df$educ_cat_1_age_3 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_1_age_2*df$age_1,NA))

df$educ_cat_2_age_3 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_2_age_2*df$age_1,NA))

df$educ_cat_3_age_3 <- as.numeric(ifelse(!is.na(df$educ_cat),df$educ_cat_3_age_2*df$age_1,NA))


## Variables to estimate permanent income deciles
# Household total income
df <- df %>% mutate(total_individual_income = iearn + issdi + isret)
df <- df %>% group_by(hhid,year) %>% mutate(total_hh_income = sum(total_individual_income, na.rm = TRUE))

# Household structure
df <- df %>% mutate(fam_struct = ifelse(mstatusd==1,1,
                                        ifelse(mstatusd==0 & SingleWoman==1,2,
                                               ifelse(!is.na(mstatusd),3,NA))))

df <- df %>% mutate(fam_struct_1 = ifelse(mstatusd==1,1,
                                          ifelse(!is.na(mstatusd),0,NA)))

df <- df %>% mutate(fam_struct_2 = ifelse(mstatusd==0 & SingleWoman==1,1,
                                          ifelse(!is.na(mstatusd),0,NA)))
df <- df %>% mutate(fam_struct_3 = ifelse(mstatusd==0 & SingleWoman==0,1,
                                          ifelse(!is.na(mstatusd),0,NA)))

df <- df %>% mutate(fam_struct_1_age_1 = ifelse(!is.na(fam_struct_1),fam_struct_1*age_1,NA))
df <- df %>% mutate(fam_struct_2_age_1 = ifelse(!is.na(fam_struct_2),fam_struct_2*age_1,NA))
df <- df %>% mutate(fam_struct_3_age_1 = ifelse(!is.na(fam_struct_3),fam_struct_3*age_1,NA))


################ All necessary variables are created ###################


## Import ID with clusters assignments 
setwd(main_path)
ID <- read_dta(file_name) %>% filter(!is.na(cluster_52_frailty_bl)) %>% select(rahhidpn,cluster_52_frailty_bl) %>% unique()

#Get initial age
data_clustering <- read_dta(file_name_data_clustering)
ID_1 <- data_clustering %>% select(rahhidpn,Initial_Age)

ID <- merge(ID, ID_1, by = "rahhidpn")

#Select our sample
df_new <- df[df$rahhidpn %in% ID$rahhidpn,]

#Get Clustering Assigment
#Merge data bases
new_df <- merge(df_new, ID, by = "rahhidpn") %>% arrange(rahhidpn,agey_e)

#Variables to keep
#Person Demographics
columns_to_keep<- c('rahhidpn', 'hcpl', 'ragender','rahispan','raracem','raedyrs', 
                    'frailty_bl','rabyear' ,'racohbyr', 'year', 'agey_e','shlt','Death','cluster_52_frailty_bl','Initial_Age')

outcomes_to_keep<-c('vigact','rassageb','issdi','isret','govmr','higov','work','Fisret','doctor','doctim')

outcomes_to_keep2<-c('age_1', 'age_2', 'age_3', 'genderd', 'genderd_age_1', 'genderd_age_2', 'mstatusd', 'mstatusd_age_1', 'mstatusd_age_2',
                     'SingleWoman', 'SingleWoman_age_1', 'srhs', 'srhs_1', 'srhs_2', 'srhs_3', 'srhs_4', 'srhs_5', 'srhs_6',
                     'srhs_1_age_1', 'srhs_2_age_1', 'srhs_3_age_1', 'srhs_4_age_1', 'srhs_5_age_1', 'srhs_6_age_1',
                     'fam_struct','fam_struct_1','fam_struct_2','fam_struct_3','fam_struct_1_age_1','fam_struct_2_age_1','fam_struct_3_age_1',
                     'total_hh_income','total_individual_income',
                     'educ_cat','educ_cat_1','educ_cat_2','educ_cat_3','educ_cat_1_age_1','educ_cat_2_age_1','educ_cat_3_age_1',
                     'educ_cat_1_age_2','educ_cat_2_age_2','educ_cat_3_age_2')

v_t_keep<- c('batha','dressa','eata','beda','toilta','walkra','walk1a','walksa',
             'shopa','phonea','moneya','mealsa','medsa','mapa',
             'clim1a','climsa','chaira','stoopa','lifta','armsa','dimea','pusha','sita',
             'hibpe','diabe','cancre','lunge', 'hearte','stroke','psyche','arthre',
             'hosp','nrshom',
             'bmigte30','smokev')

new_df <- new_df[order(new_df$rahhidpn, new_df$year),] %>% filter(agey_e>=Initial_Age) 

new_df <- new_df %>%
  mutate(Death = ifelse(frailty_bl==1,1,0)) %>%
  filter(!is.na(frailty_bl)) %>% select(all_of(columns_to_keep),all_of(outcomes_to_keep),all_of(outcomes_to_keep2),all_of(v_t_keep)) %>%
  arrange(rahhidpn, agey_e) %>%
  group_by(rahhidpn) %>%
  mutate(n_obs = row_number(),max_nobs=max(n_obs)) 


### Compute Permanent Income : Same methodology as De Nardi, French, Jones, McGee (2024)

## Take log of income
new_df<- new_df %>% mutate(log_hh_income=ifelse(!is.na(total_hh_income) & total_hh_income>1000,log(total_hh_income),NA))


data_PI<- subset(new_df,!is.na(log_hh_income) & !is.na(fam_struct))

# Fit linear regression model
formula_PI<- log_hh_income ~ age_1 + age_2 + age_3 + 
  fam_struct_2 + fam_struct_3 + 
  fam_struct_2_age_1 + fam_struct_3_age_1
modelPI <- lm(formula_PI, data = data_PI)

predictions <- predict(modelPI, newdata = data_PI)

# Add the predictions to your original dataframe
data_PI$predicted_log_hh_income <- predictions
data_PI$redidual<- data_PI$log_hh_income -  data_PI$predicted_log_hh_income 

data_PI <- data_PI %>% group_by(rahhidpn) %>% mutate(mean_residual = mean(redidual,na.rm = TRUE)) %>% ungroup()

PI<-data_PI %>% select(rahhidpn,mean_residual) %>% unique() %>% mutate(percrank=rank(mean_residual)/length(mean_residual)*100)

#Get percentiles back
new_df <- full_join(new_df, PI, by = "rahhidpn")


## Generate remaining data
new_df$PI <- new_df$percrank 
new_df$PI_2 <- new_df$PI*new_df$PI 
new_df$PI_age_1 <- ifelse(!is.na(new_df$PI),new_df$PI*new_df$age_1,NA) 
new_df$PI_2_age_1 <- ifelse(!is.na(new_df$PI),new_df$PI_2*new_df$age_1,NA) 
new_df$PI_mstatusd <- ifelse(!is.na(new_df$PI) & !is.na(new_df$mstatusd),new_df$PI*new_df$mstatusd,NA) 
new_df$PI_2_mstatusd <- ifelse(!is.na(new_df$PI) & !is.na(new_df$mstatusd),new_df$PI_2*new_df$mstatusd,NA) 
new_df$PI_2_mstatusd_gender <- ifelse(!is.na(new_df$PI) & !is.na(new_df$mstatusd) & !is.na(new_df$genderd),new_df$PI_2_mstatusd*new_df$genderd,NA) 


# Explore result when imputing missing frailty categories using KNN
# Default option is no imputation

Impute_missing_frailty <-0

if (Impute_missing_frailty ==1){

### Impute Missing values of frailty categories of frailty
## This could be not done. check

new_df$missing_count <- rowSums(is.na(new_df[, v_t_keep]))

#Get Death to have all the problems
new_df <- new_df %>% 
  mutate(across(all_of(v_t_keep), ~ifelse(frailty_bl == 1, 2, .)))


data<- as.matrix(new_df[,v_t_keep])

# Impute missing values using KNN
imputed_data<- KNNimp(data, k = 15, scale = FALSE, meth = "median", distData = NULL)
imputed_data2<-as.data.frame(imputed_data) %>% mutate(across(all_of(v_t_keep), ~ ifelse(. == 0.5, 1, .)))


# Convert the result back to a data frame
imputed_data2 <- as.data.frame(imputed_data2)


df_imputed <- new_df %>%
  select(-one_of(v_t_keep)) %>%
  bind_cols(imputed_data2)

df_imputed$frailty_new <-  rowSums(df_imputed[,v_t_keep])


summary_stats <- df_imputed %>%
  filter(Death == 0) %>%
  group_by(n_obs) %>%
  summarise(
    mean_frailty = mean(frailty_new, na.rm = TRUE),
    median_frailty = median(frailty_new, na.rm = TRUE),
    var_frailty = var(frailty_new, na.rm = TRUE),
    sd_frailty = sd(frailty_new, na.rm = TRUE),
    p25_frailty = quantile(frailty_new, 0.25, na.rm = TRUE),
    p75_frailty = quantile(frailty_new, 0.75, na.rm = TRUE),
    N = n(),
    mean_age = mean(agey_e, na.rm = TRUE)
  )

summary_stats_d <- df_imputed %>%
  filter(Death == 1) %>%
  group_by(n_obs) %>%
  summarise(
    death_index = mean(frailty_new, na.rm = TRUE),
    N_death = n()
  )

merged_summary <- full_join(summary_stats, summary_stats_d, by = "n_obs") %>% mutate(total_N=  N + N_death) %>% filter(n_obs<=5)

}

if (Impute_missing_frailty ==0){
  df_imputed <- new_df %>%
    select(-one_of(v_t_keep))
}


#####  Final Data Processing

df_proc <- df_imputed  %>%
  arrange(rahhidpn, agey_e) %>%
  group_by(rahhidpn) %>%
  mutate(lead_srhs = lead(srhs),
         cluster_sel=ifelse(cluster_52_frailty_bl==3  | cluster_52_frailty_bl==5,1,0)) 

end_time <- Sys.time()
runtime <- end_time-start_time
print(runtime)

# Remove all objects from the workspace
rm(list = setdiff(ls(), c(to_keep, "df_proc","df")))
