# Need to upload data
setwd(main_path)

### Import relevant libraries
library(tidyverse)
library(haven)
library(stats)
library(ggplot2)
library(openxlsx)
library(factoextra)

#Store Environment
to_keep <- ls()
start_time <- Sys.time()

tracker<- read_dta("rawdata/trk2022tr_r.dta") 

names(tracker) <- tolower(names(tracker))
tracker_birth_year<- tracker %>% mutate(rahhidpn = as.numeric(paste0(hhid, pn))) %>% 
  select(hhid,pn,rahhidpn,birthyr) %>% mutate(birthyr=ifelse(birthyr==0,NA,birthyr))


df <- read_dta("dtafiles/CleanPanelBalanced.dta") %>%  mutate(shlt = ifelse(Dead==1, 6, shlt), frailty_bl = frailty32) %>% arrange(rahhidpn,year)
  
df <- left_join(df,tracker_birth_year, by = "rahhidpn") %>% mutate(birthyear=birthyr) %>% 
  mutate(agey_e = ifelse(!is.na(agey_e), agey_e, year-birthyear),
         agey_e2= year-birthyear)

##########################
###
dff <- df %>%  filter(between(agey_e, 52, 61)) %>% arrange(rahhidpn,year) %>% group_by(rahhidpn) %>% 
  mutate(n_obs = row_number(), age_init=min(agey_e),max_nobs=max(n_obs),age_second=sort(agey_e)[2],
         missing_frailty=sum(is.na(frailty_bl)),F4_Age=lead(agey_e,4),
         F5_Age=lead(agey_e,5),
         frailty=frailty_bl,
         F_frailty = lead(frailty_bl,1),
         F2_frailty= lead(frailty_bl,2),
         F3_frailty= lead(frailty_bl,3),
         F4_frailty= lead(frailty_bl,4),
         F5_frailty= lead(frailty_bl,5),
         five_row= ifelse(!is.na(frailty) & !is.na(F_frailty) & !is.na(F2_frailty) & !is.na(F3_frailty) & !is.na(F4_frailty),1,0),
         last_row=ifelse(!is.na(F_frailty) & !is.na(F2_frailty) & !is.na(F3_frailty) & !is.na(F4_frailty) & !is.na(F5_frailty),1,0),
         death_1 =ifelse(frailty==1,1,0),
         death_2 =ifelse(F_frailty==1,1,0)) %>%
  select(rahhidpn,year,birthyear,n_obs,age_init,max_nobs,age_second,F4_Age,F5_Age,
         missing_frailty,Dead,frailty,F_frailty,F2_frailty,F3_frailty,F4_frailty,F5_frailty,five_row,last_row,death_1,death_2) %>% filter(n_obs==1)

### Generate conditions for inclusion ->
dff <- dff %>% mutate(crit_1= ifelse(age_init<54 & five_row ==1 & death_1==0,1,0),
                      crit_2= ifelse(age_second<54 & five_row==0 & last_row==1 & death_2==0,1,0)) %>% filter(crit_1==1 | crit_2==1 )




## Strict sample: Age 52-53
df_strict<- dff %>% mutate(frailty=ifelse(crit_2==1, F_frailty,frailty),
                           F_frailty=ifelse(crit_2==1, F2_frailty,F_frailty),
                           F2_frailty=ifelse(crit_2==1, F3_frailty,F2_frailty),
                           F3_frailty=ifelse(crit_2==1, F4_frailty,F3_frailty),
                           F4_frailty=ifelse(crit_2==1, F5_frailty,F4_frailty),
                           Initial_Age=ifelse(crit_2==1, age_second,age_init),
                           Final_Age=ifelse(crit_2==1, F5_Age,F4_Age)) %>%
                           select(rahhidpn,frailty,F_frailty,F2_frailty,F3_frailty,F4_frailty,Initial_Age,Final_Age) %>% ungroup()
#Export data in wide format:
write_dta(df_strict, path = paste0("dtafiles/data_clustering.dta"))
write_dta(df_strict, path = paste0("dtafiles/data_clustering.csv"))

## Run Clustering  
cluster <- function(var, k, dta){
  subdf <- dta %>% select("rahhidpn")
  filtered <- dta %>% select(ends_with(var)) %>% as.matrix()
  cresult <- kmeans(filtered, k, iter.max = 50, nstart = 1000)
  cent <- cresult$centers
  order <- apply(cent, MARGIN = 1, mean)
  order <- rank(order)
  ids <- cresult$cluster
  a<-order[ids]
  subdf$temp <-a 
  name <- paste0("cluster_52_frailty_bl")
  subdf<-subdf %>% rename_at("temp", ~name)
  return(subdf)
}

for(i in 2:10){
  var <- c('frailty')
  dta <- df_strict
  clustered <- cluster(var, i, dta)
  clustered_out<-inner_join(dta, clustered, by = "rahhidpn") %>% select(rahhidpn,cluster_52_frailty_bl,Initial_Age,Final_Age)
  dta_out <- inner_join(df, clustered_out, by = "rahhidpn")
  write_dta(dta_out, path = paste0("dtafiles/P52_",i,"_Clusters.dta"))
  write_dta(dta_out, path = paste0("dtafiles/P52_",i,"_Clusters.csv"))
}


################################################################################
## Appendix A: Create Tables with Data Selection
################################################################################

# Total Individuals
T_final <- df %>%  filter(between(agey_e, 52, 61)) %>% arrange(rahhidpn,year) %>% group_by(rahhidpn) %>% 
  mutate(n_obs = row_number(), age_init=min(agey_e),max_nobs=max(n_obs),age_second=sort(agey_e)[2],
         missing_frailty=sum(is.na(frailty_bl)),F4_Age=lead(agey_e,4),
         F5_Age=lead(agey_e,5),
         frailty=frailty_bl,
         F_frailty = lead(frailty_bl,1),
         F2_frailty= lead(frailty_bl,2),
         F3_frailty= lead(frailty_bl,3),
         F4_frailty= lead(frailty_bl,4),
         F5_frailty= lead(frailty_bl,5),
         five_row= ifelse(!is.na(frailty) & !is.na(F_frailty) & !is.na(F2_frailty) & !is.na(F3_frailty) & !is.na(F4_frailty),1,0),
         last_row=ifelse(!is.na(F_frailty) & !is.na(F2_frailty) & !is.na(F3_frailty) & !is.na(F4_frailty) & !is.na(F5_frailty),1,0),
         death_1 =ifelse(frailty==1,1,0),
         death_2 =ifelse(F_frailty==1,1,0)) %>%
  select(rahhidpn,year,birthyear,n_obs,age_init,max_nobs,age_second,F4_Age,F5_Age,
         missing_frailty,Dead,frailty,F_frailty,F2_frailty,F3_frailty,F4_frailty,F5_frailty,five_row,last_row,death_1,death_2) %>% filter(n_obs==1)


### Generate conditions for inclusion ->
T_final_appendix <- T_final %>% mutate(crit_1= ifelse(age_init<54 & five_row ==1 & death_1==0,1,0),
                              crit_2= ifelse(age_second<54 & five_row==0 & last_row==1 & death_2==0,1,0)) 

T_final_appendix <- T_final_appendix %>% mutate(crit_v1= ifelse(age_init<54,1,0),
                                      any_frailty_info=ifelse(!is.na(frailty)| !is.na(F_frailty) | !is.na(F2_frailty)
                                                                 | !is.na(F3_frailty) | !is.na(F4_frailty) | !is.na(F5_frailty),1,0),
                                      death_at_beg= ifelse(!is.na(Dead),Dead, 
                                                           ifelse(any_frailty_info==1,0,NA)),
                                      crit_v2=ifelse(!is.na(F4_frailty)|!is.na(F5_frailty),1,0))

seletion_1<-T_final_appendix %>% filter(crit_v1==1 & death_at_beg==0)
seletion_2<-T_final_appendix %>% filter(crit_v1==1 & death_at_beg==0 & crit_v2==1)
seletion_3<-T_final_appendix %>% filter(crit_v1==1 & death_at_beg==0 & crit_v2==1) %>%filter(crit_1==1 | crit_2==1)


###############################################################################################
## Generate Table 8: Number of individuals each step approaching the sample used for clustering
###############################################################################################
full_sample_count <- df %>% distinct(rahhidpn) %>% nrow()
in_sample_52_53_count <- seletion_1 %>% distinct(rahhidpn) %>% nrow()
in_sample_60_61_count <- seletion_2 %>% distinct(rahhidpn) %>% nrow()
in_sample_constructible_count <- seletion_3 %>% distinct(rahhidpn) %>% nrow()

# Create a summary table
summary_table <- data.frame(
  Step = c(
    "Full sample",
    "Alive and observed at age 52-53",
    "Observed at age 60-61 or older by 2018",
    "Constructible frailty index during clustering"
  ),
  'Number of Individuals' = c(
    full_sample_count,
    in_sample_52_53_count,
    in_sample_60_61_count,
    in_sample_constructible_count
  )
)
# Rename the column to have spaces
colnames(summary_table)[2] <- "Number of individuals"

# Export as a LaTeX table
latex_table <- summary_table %>%
  kable(
    "latex", 
    booktabs = TRUE,  # Use booktabs for clean formatting
    col.names = c("Step", "Number of individuals"), 
    align = c("l", "|c")  # Align first column left, others center
  ) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE) %>%
  row_spec(0, align = "c")  # Center-align only the header row

# View LaTeX code
print(latex_table)

# Save to a file
cat(latex_table, file = "output/Part2_output/Part2_a_clustering/tab8.tex")



################################################################################################################
##Generate Table 9: Number of individuals each step approaching the sample used used for predictive exercise
################################################################################################################

file_name<- c('dtafiles/P52_5_Clusters.dta')

labs = c("age", "age$^{2}$/100", "age$^{3}$/10000", "GED", "HS", "HS/GED", "Associate's",
         "Bachelor's", "Master's", "Doctorate", "Black", "Other Non-White", 
         "Woman", "Cohort 5", "Married", "Type 2", "Type 3", "Type 4", "Type 5",
         "$f_{0}$", "$s_{0}$")

df_raw <- read_dta(file_name)
for(i in c(4,5)){
  # Generate the file path
   file_path<- paste0("dtafiles/P52_",i,"_Clusters.dta")

   # Read the data from the file
  df_raw_new <- read_dta(file_path)
  
  # Create a new column in df_raw with the name "cluster_i" and store the values
  df_raw <- df_raw %>%
    mutate(!!paste0("cluster_",i) := as.factor(df_raw_new$cluster_52_frailty_bl))
}


df_base <- df_raw %>% mutate(hcpl = ifelse(is.na(hcpl), as.numeric(remstat<=3), hcpl),
                             age = agey_e, age2 = agey_e^2/100, age3 = agey_e^3/10000) %>%
  mutate(hhearn = ifelse(hcpl==1, iearn+iearnspouse, iearn), cluster = as.factor(cluster_52_frailty_bl)) %>% 
  filter(!(is.na(hcpl)|is.na(frailty_bl)|is.na(cluster_52_frailty_bl))) %>%
  filter(agey_e>=Initial_Age) %>% arrange(rahhidpn, year) %>% group_by(rahhidpn) %>%
  mutate(frailty_trunc = ifelse(row_number()==1, frailty_bl, NA),
         shlt_trunc = ifelse(row_number()==1, shlt, NA)) %>%
  mutate(frailty_mean = mean(frailty_trunc, na.rm = TRUE),
         shlt_mean = mean(shlt_trunc, na.rm = TRUE)) %>%
  mutate(lag_frailty = lag(frailty_bl,1),
         lag_srhs = lag(shlt,1)) %>%
  mutate(frailty_end = ifelse(agey_e==Final_Age,frailty_bl,NA),
         frailty_end2= mean(frailty_end, na.rm = TRUE)) %>%
  filter(Dead[5]==0) %>% ungroup()


# Define Data to use in regresions  
df_frail <- df_base %>% filter(Dead==0) %>% group_by(rahhidpn) %>%
  filter(agey_e>Final_Age) %>% ungroup()

df_base2 <- df_raw %>% mutate(hcpl = ifelse(is.na(hcpl), as.numeric(remstat<=3), hcpl),
                              age = agey_e, age2 = agey_e^2/100, age3 = agey_e^3/10000) %>%
  mutate(hhearn = ifelse(hcpl==1, iearn+iearnspouse, iearn), cluster = as.factor(cluster_52_frailty_bl)) %>% 
  filter(agey_e>=Initial_Age) %>% arrange(rahhidpn, year) %>% 
  group_by(rahhidpn) %>%
  mutate(posit=row_number()) %>%
  mutate(frailty_trunc = ifelse(row_number()==1, frailty_bl, NA),
         shlt_trunc = ifelse(row_number()==1, shlt, NA),
         frailty_trunc2 = ifelse(agey_e<=Final_Age, frailty_bl,NA)) %>%
  mutate(frailty_mean = mean(frailty_trunc, na.rm = TRUE),
         shlt_mean = mean(shlt_trunc, na.rm = TRUE),
         frailty_true_mean=mean(frailty_trunc2, na.rm = TRUE))  %>%
  mutate(lag_frailty = lag(frailty_bl,1),
         lead_frailty=lead(frailty_bl,1),
         lag_srhs = lag(shlt,1),
         lead_srhs=lead(shlt,1),
         lead_death=lead(Dead,1)) %>%
  mutate(frailty_end = ifelse(agey_e==Final_Age,frailty_bl,NA),
         frailty_end2= mean(frailty_end, na.rm = TRUE)) %>%
  mutate(died_clustering=ifelse(agey_e==Final_Age,Dead,NA),
         Died_clustering=mean(died_clustering, na.rm = TRUE))%>%
  mutate(log_frailty = ifelse(!is.na(frailty_bl) & frailty_bl>0,log(frailty_bl),NA)) %>%
  mutate(cond=ifelse(is.na(hcpl)|is.na(frailty_bl)|is.na(cluster_52_frailty_bl)|is.na(raedegrm)|is.na(ragender)|is.na(cohort)|is.na(age)
                     ,0,1)) %>% 
  ungroup()


#All individuals with assigned health type
Predicitive_0 <- df_base2 %>% filter(agey_e>=Initial_Age) %>% filter(agey_e<=Final_Age) 
Predicitive_0_id <- df_base2 %>% select(rahhidpn) %>% unique()

Predicitive_1 <- df_base2 %>% filter(agey_e>Final_Age)
Predicitive_1_id <- df_base2 %>% filter(agey_e>Final_Age) %>% select(rahhidpn) %>% unique()

#Alive at 60-61
Predicitive_2 <- df_base2 %>% filter(Died_clustering==0) 
Predicitive_2_id <- df_base2 %>% filter(Died_clustering==0) %>% select(rahhidpn) %>% unique()

#Alive at 60-61 and with at least 1 realization after age 60-61
Predicitive_3 <- df_base2 %>% filter(Died_clustering==0) %>% filter(agey_e>Final_Age) %>% 
                filter(!is.na(frailty_bl))

Predicitive_3_id <- df_base2 %>% filter(Died_clustering==0) %>% filter(agey_e>Final_Age) %>% 
  filter(!is.na(frailty_bl)) %>% select(rahhidpn) %>% unique()

#Alive at 60-61 and with at least 1 realization after age 60-61 and with non-missing information
Predicitive_4 <- df_base2 %>% filter(Died_clustering==0) %>% filter(agey_e>Final_Age) %>% 
   filter(!is.na(frailty_bl) & cond==1) 

Predicitive_4_id <- df_base2 %>% filter(Died_clustering==0) %>% filter(agey_e>Final_Age) %>% 
  filter(!is.na(frailty_bl) & cond==1) %>% select(rahhidpn) %>% unique()


#Alive at 60-61 and with at least 1 realization after age 60-61 and with non-missing information and Alive
Predicitive_5 <- df_base2 %>% filter(Died_clustering==0) %>% filter(agey_e>Final_Age) %>% 
    filter(!is.na(frailty_bl) & cond==1 & Dead==0) 

Predicitive_5_id <- df_base2 %>% filter(Died_clustering==0) %>% filter(agey_e>Final_Age) %>% 
    filter(!is.na(frailty_bl) & cond==1 & Dead==0) %>% select(rahhidpn) %>% unique()

# Step 1: Calculate counts for each step
assigned_health_type_individuals <- nrow(Predicitive_0_id)
assigned_health_type_observations <- nrow(Predicitive_0)

alive_at_60_61_individuals <- nrow(Predicitive_2_id)
alive_at_60_61_observations <- nrow(Predicitive_2)

realization_after_60_61_individuals <- nrow(Predicitive_3_id)
realization_after_60_61_observations <- nrow(Predicitive_3)

non_missing_info_individuals <- nrow(Predicitive_4_id)
non_missing_info_observations <- nrow(Predicitive_4)

non_missing_info_alive_individuals <- nrow(Predicitive_5_id)
non_missing_info_alive_observations <- nrow(Predicitive_5)

# Step 2: Create a summary table with proper column names and blanks for the first three rows
summary_table <- data.frame(
  Step = c(
    "Assigned a health type",
    "Alive at 60-61",
    "At least one health realization after age 60-61",
    "Non-missing information",
    "Non-missing information and alive"
  ),
  `Number of Individuals` = c(
    assigned_health_type_individuals,
    alive_at_60_61_individuals,
    realization_after_60_61_individuals,
    non_missing_info_individuals,
    non_missing_info_alive_individuals
  ),
  `N of Observations (for prediction)` = c(
    "",  # Blank for the first row
    "",  # Blank for the second row
    "",  # Blank for the third row
    non_missing_info_observations,
    non_missing_info_alive_observations
  )
)

# Rename the column to have spaces
colnames(summary_table)[2] <- "Number of individuals"
colnames(summary_table)[3] <- "N of observations"

# Export as a LaTeX table
latex_table <- summary_table %>%
  kable(
    "latex", 
    booktabs = TRUE,  # Use booktabs for clean formatting
    col.names = c("Step", "Number of individuals", "N of observations"), 
    align = c("l", "|c", "|c")  # Align first column left, others center
  ) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE) %>%
  row_spec(0, align = "c")  # Center-align only the header row


# View LaTeX code
print(latex_table)

# Save the table to a LaTeX file
cat(latex_table, file = "output/Part2_output/Part2_a_clustering/tab9.tex")




################################################################################
## Appendix C
################################################################################
library(cluster)
library(factoextra)
library(dplyr)

# Assuming data is loaded into 'data'
var <- c('frailty')
dta <- df_strict
subdf <- dta %>% select("rahhidpn")
filtered <- dta %>% select(ends_with(var)) %>% as.matrix()

# Maximun K
k_max<-15

################################################################################
## Appendix C: Elbow Analysis:
################################################################################
# Perform K-means clustering for different values of K
k_values <- 2:k_max  # Example range of K values
kmeans_results <- lapply(k_values, function(k) kmeans(filtered, centers = k,iter.max = 50, nstart = 1000))

# Compute total within-cluster sum of squares (elbow criterion)
wss <- sapply(kmeans_results, function(kmeans_model) 1-kmeans_model$tot.withinss/kmeans_model$totss)


k_new<- 1:k_max
wss2 <- c(0,wss) 

# Create a data frame for plotting
df <- data.frame(k_values = k_new, wss = wss2)

# Plot using ggplot2
ggplot(df, aes(x = k_values, y = wss)) +
  geom_line(size = 1.15) +
  labs(x = "Number of clusters (K)",
       y = "Fraction of total variance explained by clusters") +
  theme_bw() +
  theme(axis.title = element_text(size = 16),  # Increase axis label font size
        axis.text = element_text(size = 14),  # Increase axis tick label font size
        panel.grid.major.x = element_line(color = "gray", size = 0.2),  # Remove major x-axis gridlines
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(color = "gray", size = 0.1),  # Remove major x-axis gridlines
        panel.grid.minor.y = element_blank()) + 
  scale_x_continuous(breaks = k_new) +
  scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0), limits = c(0, 1)) +
  # Add transparent blue bar zone between x=4 and x=6
  geom_rect(xmin = 2, xmax = 5, ymin = 0, ymax = 1, fill = "grey", alpha = 0.03) 
# Export Plot
ggsave(
  filename = "output/Part2_output/Part2_a_clustering/fig9-1.pdf",
  width = 6,  # Adjust width as needed for your publication
  height = 6,   # Adjust height as needed for your publication
  dpi = 300,    # Optional: Specify DPI for rasterized layers, though unnecessary for PDF
  device = "pdf"  # Use Cairo for advanced PDF output
)
# Conditional EPS export
if (eps == 1) {
  ggsave(
    filename = "output/Part2_output/Part2_a_clustering/fig9-1.eps",
    width = 6,  # Adjust width as needed for your publication
    height = 6,   # Adjust height as needed for your publication
    dpi = 300,    # Optional: Specify DPI for rasterized layers, though unnecessary for PDF
    device = cairo_ps  # Use Cairo for advanced PDF output
  )
}
################################################################################
## Appendix C: Silhouette Analysis:
################################################################################
clust_stats <- fviz_nbclust(filtered, kmeans, method = "silhouette", 
                            k.max = 15, iter.max = 50, nstart = 1000)

clust_stats_data <-clust_stats$data

# Plot using ggplot2
ggplot(clust_stats_data, aes(x = as.numeric(clusters), y = y)) +
  geom_line(size = 1.15) +
  labs(x = "Number of clusters (K)",
       y = "Average silhoutte width") +
  theme_bw() +
  theme(axis.title = element_text(size = 16),  # Increase axis label font size
        axis.text = element_text(size = 14),  # Increase axis tick label font size
        panel.grid.major.x = element_line(color = "gray", size = 0.2),  # Remove major x-axis gridlines
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(color = "gray", size = 0.1),  # Remove major x-axis gridlines
        panel.grid.minor.y = element_blank()) + 
  scale_x_continuous(breaks = k_new) +
  scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0), limits = c(0, 1)) +
  # Add transparent blue bar zone between x=4 and x=6
  #geom_rect(xmin = 1.95, xmax = 2.05, ymin = 0, ymax = 1, fill = "grey", alpha = 0.03) 
    # Add vertical dashed line at X=2
  geom_vline(xintercept = 2, linetype = "dashed", color = "black", size = 1)
# Export Plot
ggsave(
  filename = "output/Part2_output/Part2_a_clustering/fig9-2.pdf",
  width = 6,  # Adjust width as needed for your publication
  height = 6,   # Adjust height as needed for your publication
  dpi = 300,    # Optional: Specify DPI for rasterized layers, though unnecessary for PDF
  device = "pdf"  # Use Cairo for advanced PDF output
)

# Conditional EPS export
if (eps == 1) {
  ggsave(
    filename = "output/Part2_output/Part2_a_clustering/fig9-2.eps",
    width = 6,  # Adjust width as needed for your publication
    height = 6,   # Adjust height as needed for your publication
    dpi = 300,    # Optional: Specify DPI for rasterized layers, though unnecessary for PDF
    device = cairo_ps  # Use Cairo for advanced PDF output
  )
}

end_time <- Sys.time()
runtime <- end_time-start_time
print(runtime)


#Clear environment
rm(list = setdiff(ls(), c(to_keep)))
