# Work from the project root. `main_path` is not defined in this file, so it
# must already exist in the calling environment; every data and output path
# used below ("dtafiles/...", "output/...") is relative to it.
setwd(main_path)

### Import relevant libraries
library(tidyverse)
library(haven)
library(stargazer)
library(DescTools)
library(latex2exp)
##


# Snapshot the objects that exist before this script runs, so the clean-up at
# the bottom of the file removes only what is created here; start the timer.
to_keep <- ls()
start_time <- Sys.time()

# Stata file with the 5-cluster assignment (read first, used as the base data)
file_name <- "dtafiles/P52_5_Clusters.dta"

# Covariate labels handed to stargazer via `covariate.labels`
labs <- c(
  # age polynomial
  "age", "age$^{2}$/100", "age$^{3}$/10000",
  # education
  "GED", "HS", "HS/GED", "Associate's", "Bachelor's", "Master's", "Doctorate",
  # race, gender, cohort, marital status
  "Black", "Other Non-White", "Woman", "Cohort 5", "Married",
  # health types
  "Type 2", "Type 3", "Type 4", "Type 5",
  # initial frailty and initial self-reported health
  "$f_{0}$", "$s_{0}$"
)

# Base data set: the 5-cluster file.
df_raw <- read_dta(file_name)

# Append the cluster assignment for each K = 2, ..., 10 as a factor column
# named cluster_K.
# NOTE(review): the columns are attached by position, so this assumes every
# P52_K file lists the same rows in the same order as the base file; that is
# not checked here.
for (i in 2:10) {
  file_path <- paste0("dtafiles/P52_", i, "_Clusters.dta")
  df_raw_new <- read_dta(file_path)

  cluster_col <- paste0("cluster_", i)
  df_raw <- mutate(
    df_raw,
    "{cluster_col}" := as.factor(df_raw_new$cluster_52_frailty_bl)
  )
}


# Build the person-year panel used by all regressions below: row-level
# covariates first, then per-person (rahhidpn) summaries computed on the rows
# with agey_e >= Initial_Age, ordered by year within person.
df_base2 <- df_raw %>%
  mutate(
    # missing couple indicator is filled with 1 when remstat <= 3, else 0
    hcpl = ifelse(is.na(hcpl), as.numeric(remstat <= 3), hcpl),
    # age polynomial, rescaled
    age = agey_e,
    age2 = agey_e^2 / 100,
    age3 = agey_e^3 / 10000,
    # household earnings: add spouse earnings when hcpl == 1
    hhearn = ifelse(hcpl == 1, iearn + iearnspouse, iearn),
    cluster = as.factor(cluster_52_frailty_bl)
  ) %>%
  filter(agey_e >= Initial_Age) %>%
  arrange(rahhidpn, year) %>%
  group_by(rahhidpn) %>%
  mutate(
    posit = row_number(),
    # values at the person's first retained row (NA elsewhere) ...
    frailty_trunc = ifelse(row_number() == 1, frailty_bl, NA),
    shlt_trunc = ifelse(row_number() == 1, shlt, NA),
    # ... and frailty over all rows up to Final_Age
    frailty_trunc2 = ifelse(agey_e <= Final_Age, frailty_bl, NA),
    # spread those values over every row of the person
    frailty_mean = mean(frailty_trunc, na.rm = TRUE),
    shlt_mean = mean(shlt_trunc, na.rm = TRUE),
    frailty_true_mean = mean(frailty_trunc2, na.rm = TRUE)
  ) %>%
  mutate(
    # one-row lags and leads within person
    lag_frailty = lag(frailty_bl, 1),
    lead_frailty = lead(frailty_bl, 1),
    lag_srhs = lag(shlt, 1),
    lead_srhs = lead(shlt, 1),
    lead_death = lead(Dead, 1)
  ) %>%
  mutate(
    # frailty and death status at age Final_Age, copied to all of the
    # person's rows through the within-person mean
    frailty_end = ifelse(agey_e == Final_Age, frailty_bl, NA),
    frailty_end2 = mean(frailty_end, na.rm = TRUE),
    died_clustering = ifelse(agey_e == Final_Age, Dead, NA),
    Died_clustering = mean(died_clustering, na.rm = TRUE)
  ) %>%
  mutate(
    log_frailty = ifelse(
      !is.na(frailty_bl) & frailty_bl > 0, log(frailty_bl), NA
    ),
    # 1 when every variable listed here is observed, 0 otherwise
    cond = ifelse(
      !is.na(hcpl) & !is.na(frailty_bl) & !is.na(cluster_52_frailty_bl) &
        !is.na(raedegrm) & !is.na(ragender) & !is.na(cohort) & !is.na(age),
      1, 0
    )
  ) %>%
  ungroup()

# Mortality sample: person-years after Final_Age for people whose death
# indicator at Final_Age is 0, with frailty and all `cond` variables observed.
# (The original notes describe Final_Age as age 60-61.)
df_mort <- df_base2 %>%
  filter(
    Died_clustering == 0,
    agey_e > Final_Age,
    !is.na(frailty_bl),
    cond == 1
  )

# Frailty sample: the mortality sample restricted to rows with Dead == 0.
df_frail <- df_mort %>%
  filter(Dead == 0)

################################################################################
# Appendix D: Tables 12 and 13
################################################################################

## Forecasting frailty after the clustering period
# Four OLS specifications on df_frail; the adjusted R^2 of each is stored for
# the summary table and the by-K figures further down.

# (1) demographic controls only
freg1 <- lm(frailty_bl ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
              hcpl, data = df_frail)
r_squared_k_1 <- summary(freg1)$adj.r.squared

# (2) controls + health types (5 clusters)
freg2 <- lm(frailty_bl ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
              hcpl + factor(cluster_5), data = df_frail)
r_squared_k_2 <- summary(freg2)$adj.r.squared

# (3) controls + initial frailty and initial self-reported health
freg3 <- lm(frailty_bl ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
              hcpl + frailty_mean + shlt_mean, data = df_frail)
r_squared_k_3 <- summary(freg3)$adj.r.squared

# (4) controls + initial health + health types
freg4 <- lm(frailty_bl ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
              hcpl + frailty_mean + shlt_mean + factor(cluster_5), data = df_frail)
r_squared_k_4 <- summary(freg4)$adj.r.squared

# `TRUE` rather than `T`: `T` is an ordinary variable that can be reassigned.
stargazer(freg1, freg2, freg3, freg4,
          type = "latex", no.space = TRUE, digits = 3,
          covariate.labels = labs,
          out = "output/Part2_output/Part2_g_predictive_exc/tab12.tex",
          omit.stat = c("f", "ser"),
          font.size = "scriptsize",
          title = "Regressions of future health on demographics, health type, and initial frailty and self-reported health.",
          label = "tab:health2")

##  Forecasting mortality after the clustering period
# Four logit specifications on df_mort, mirroring the frailty regressions.
# PseudoR2() is called with its default statistic; the values are stored for
# this table's extra row, the summary table and the by-K figures.

# (1) demographic controls only
mreg1 <- glm(Dead ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
               hcpl, data = df_mort, family = binomial)
summary(mreg1)
pseudo_r_squared_k_1 <- PseudoR2(mreg1)

# (2) controls + health types (5 clusters)
mreg2 <- glm(Dead ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
               hcpl + factor(cluster_5), data = df_mort, family = binomial)
summary(mreg2)
pseudo_r_squared_k_2 <- PseudoR2(mreg2)

# (3) controls + initial frailty and initial self-reported health
mreg3 <- glm(Dead ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
               hcpl + frailty_mean + shlt_mean, data = df_mort, family = binomial)
summary(mreg3)
pseudo_r_squared_k_3 <- PseudoR2(mreg3)

# (4) controls + initial health + health types
mreg4 <- glm(Dead ~ age + age2 + age3 + factor(raedegrm) + factor(raracem) + ragender + factor(cohort) +
               hcpl + frailty_mean + shlt_mean + factor(cluster_5), data = df_mort, family = binomial)
summary(mreg4)
pseudo_r_squared_k_4 <- PseudoR2(mreg4)

# The pseudo-R^2 row reuses the values stored above instead of calling
# PseudoR2() a second time for every model. `TRUE` replaces the reassignable
# shorthand `T`.
stargazer(mreg1, mreg2, mreg3, mreg4,
          type = "latex", no.space = TRUE, digits = 3,
          add.lines = list(c("$\\rho^{2}$",
                             round(pseudo_r_squared_k_1, digits = 3),
                             round(pseudo_r_squared_k_2, digits = 3),
                             round(pseudo_r_squared_k_3, digits = 3),
                             round(pseudo_r_squared_k_4, digits = 3))),
          covariate.labels = labs,
          out = "output/Part2_output/Part2_g_predictive_exc/tab13.tex",
          omit.stat = c("aic"),
          font.size = "scriptsize",
          title = "Regressions of future mortality on demographics, health type, and initial frailty and self-reported health.",
          label = "tab:mort2")


################################################################################
# Table 5
################################################################################
# Goodness-of-fit values of the four frailty and four mortality models,
# formatted as strings with exactly 3 decimal places.
r2 <- c(r_squared_k_1, r_squared_k_2, r_squared_k_3, r_squared_k_4)  # adjusted R^2
pseudo_r2 <- c(pseudo_r_squared_k_1, pseudo_r_squared_k_2,
               pseudo_r_squared_k_3, pseudo_r_squared_k_4)            # pseudo-R^2

r2 <- format(round(r2, 3), nsmall = 3)
pseudo_r2 <- format(round(pseudo_r2, 3), nsmall = 3)

# One column per model. The first three rows mark with "x" which regressor
# groups the model contains; the last two rows carry the fit statistic.
table_data <- data.frame(
  Category = c(
    "\\textit{Controls}",
    "\\textit{Initial health}",
    "\\textit{Health types}",
    "$R^2$",
    "Pseudo-$R^2$"
  ),
  Future_Frailty_1 = c("x", "", "", r2[1], ""),
  Future_Frailty_2 = c("x", "", "x", r2[2], ""),
  Future_Frailty_3 = c("x", "x", "", r2[3], ""),
  Future_Frailty_4 = c("x", "x", "x", r2[4], ""),
  Future_Mortality_1 = c("x", "", "", "", pseudo_r2[1]),
  Future_Mortality_2 = c("x", "", "x", "", pseudo_r2[2]),
  Future_Mortality_3 = c("x", "x", "", "", pseudo_r2[3]),
  Future_Mortality_4 = c("x", "x", "x", "", pseudo_r2[4]),
  stringsAsFactors = FALSE
)

# Generate the LaTeX table. knitr and kableExtra are not attached by the
# library() calls at the top of this file, so their functions are called with
# an explicit namespace; this works whether or not a calling script has
# attached them.
latex_table <- knitr::kable(
  table_data,
  format = "latex",
  booktabs = TRUE,
  escape = FALSE,  # keep the LaTeX markup in the cells as-is
  col.names = c("", "(1)", "(2)", "(3)", "(4)", "(1)", "(2)", "(3)", "(4)")
) %>%
  kableExtra::add_header_above(
    c(" " = 1, "Future Frailty" = 4, "Future Mortality" = 4)
  ) %>%
  kableExtra::kable_styling(
    latex_options = c("hold_position"), full_width = FALSE
  ) %>%
  kableExtra::row_spec(3, hline_after = TRUE)  # rule between markers and fit rows

# Save the table to a .tex file
# NOTE(review): the section header calls this Table 5 but the file is named
# tab2.tex; confirm which name the paper expects.
writeLines(latex_table, "output/Part2_output/Part2_g_predictive_exc/tab2.tex")

################################################################################
# Appendix D: Figures 10 and 11
################################################################################
## Expand the analysis with different values of K
# For each number of clusters K, store the fit of four models:
#   F1 / M1: controls + health types             (frailty OLS / mortality logit)
#   F2 / M2: controls + health types + initial health
# K = 1 has no type dummies, so its row reuses the models without health types
# estimated above. The table is filled as a preallocated numeric matrix, so
# the statistics are never converted to text and back, and the loop keeps its
# results in the matrix instead of overwriting the r_squared_k_* /
# pseudo_r_squared_k_* values reported in the 5-cluster tables.

rhs_controls <- paste(
  "age + age2 + age3 + factor(raedegrm) + factor(raracem) +",
  "ragender + factor(cohort) + hcpl"
)
rhs_initial_health <- "frailty_mean + shlt_mean"

k_values <- 1:10
fit_by_k <- matrix(
  NA_real_,
  nrow = length(k_values), ncol = 5,
  dimnames = list(as.character(k_values), c("Cluster", "F1", "F2", "M1", "M2"))
)

# K = 1: models (1) and (3) from the tables above
fit_by_k["1", ] <- c(1, r_squared_k_1, r_squared_k_3,
                     pseudo_r_squared_k_1, pseudo_r_squared_k_3)

for (i in k_values[-1]) {
  # cluster_<i> was stored as a factor when the cluster files were merged,
  # so it enters the formulas as a set of type dummies without factor().
  rhs_types <- paste(rhs_controls, "+", paste0("cluster_", i))
  rhs_types_health <- paste(rhs_types, "+", rhs_initial_health)

  freg_k <- lm(as.formula(paste("frailty_bl ~", rhs_types)),
               data = df_frail)
  freg_k_2 <- lm(as.formula(paste("frailty_bl ~", rhs_types_health)),
                 data = df_frail)
  mreg_k <- glm(as.formula(paste("Dead ~", rhs_types)),
                data = df_mort, family = binomial)
  mreg_k_2 <- glm(as.formula(paste("Dead ~", rhs_types_health)),
                  data = df_mort, family = binomial)

  fit_by_k[as.character(i), ] <- c(
    i,
    summary(freg_k)$adj.r.squared,
    summary(freg_k_2)$adj.r.squared,
    PseudoR2(mreg_k),
    PseudoR2(mreg_k_2)
  )
}

# Numeric data frame with columns Cluster, F1, F2, M1, M2 and row names
# "1", ..., "10"; this is what the figure code below reads.
results_matrix <- data.frame(fit_by_k)

# Directory that receives all four figures of this section.
fig_out_dir <- "output/Part2_output/Part2_g_predictive_exc/"

# Plot one fit-statistic column of `results_matrix` against the number of
# clusters and save it as <file_stem>.pdf, plus <file_stem>.eps on request.
#
# y_col          name of the column to plot ("F1", "F2", "M1" or "M2")
# y_label        y-axis title, passed through TeX()
# y_breaks       y-axis breaks; the axis limits are their range
# file_stem      output file name without extension
# base_text_size optional size for the theme's base `text` element
# export_eps     also write an EPS copy (by default when the global `eps` is 1)
#
# Returns the plot invisibly. The plot is passed to ggsave() explicitly so the
# saved file does not depend on which plot happens to be last_plot().
save_fit_by_k_plot <- function(y_col, y_label, y_breaks, file_stem,
                               base_text_size = NULL,
                               export_eps = (eps == 1)) {
  fit_plot <- ggplot(results_matrix, aes(x = Cluster, y = .data[[y_col]])) +
    geom_line(linewidth = 1) +
    # dashed red marker at the 5-cluster specification
    geom_vline(xintercept = 5, color = "red", linetype = "dashed",
               linewidth = 1.15) +
    ylab(TeX(y_label)) +
    xlab("Number of Clusters") +
    scale_y_continuous(breaks = y_breaks, limits = range(y_breaks)) +
    scale_x_continuous(breaks = unique(results_matrix$Cluster)) +
    theme_bw() +
    theme(
      axis.title = element_text(size = 16),  # larger axis titles
      axis.text = element_text(size = 14),   # larger tick labels
      # thin gray major gridlines, no minor gridlines
      panel.grid.major.x = element_line(color = "gray", linewidth = 0.2),
      panel.grid.minor.x = element_blank(),
      panel.grid.major.y = element_line(color = "gray", linewidth = 0.1),
      panel.grid.minor.y = element_blank()
    )

  if (!is.null(base_text_size)) {
    fit_plot <- fit_plot + theme(text = element_text(size = base_text_size))
  }

  ggsave(filename = paste0(file_stem, ".pdf"), plot = fit_plot,
         path = fig_out_dir, device = "pdf", width = 5, height = 4)

  # Conditional EPS export
  if (export_eps) {
    ggsave(filename = paste0(file_stem, ".eps"), plot = fit_plot,
           path = fig_out_dir, device = cairo_ps, width = 5, height = 4)
  }

  invisible(fit_plot)
}

r2_breaks <- c(0, 0.25, 0.5, 0.75)
pseudo_r2_breaks <- c(0, 0.1, 0.2, 0.3)

# Figure 10: adjusted R^2 of the frailty regressions by number of clusters.
# Only the first panel set the base text size in the original script; that is
# kept so the saved files are unchanged.
save_fit_by_k_plot("F1", "Adjusted $R^2$", r2_breaks, "fig10-1",
                   base_text_size = 14)
save_fit_by_k_plot("F2", "Adjusted $R^2$", r2_breaks, "fig10-2")

# Figure 11: pseudo-R^2 of the mortality regressions by number of clusters.
save_fit_by_k_plot("M1", "McFadden pseudo-$R^2$", pseudo_r2_breaks, "fig11-1")
save_fit_by_k_plot("M2", "McFadden pseudo-$R^2$", pseudo_r2_breaks, "fig11-2")


# Report how long the script took
end_time <- Sys.time()
runtime <- difftime(end_time, start_time)
print(runtime)

# Clear environment: drop everything this script created, keeping only the
# objects that already existed when `to_keep` was recorded
created_here <- setdiff(ls(), to_keep)
rm(list = created_here)






