# Work from the project directory. `main_path` is not defined in this file and
# must already exist when the script runs.
setwd(main_path)

#Import required libraries
library(tidyverse)
library(haven)
library(stats)
library(ggplot2)
library(openxlsx)
library(factoextra)

# Remember which objects existed before this script created its own (used by
# the cleanup at the end) and when the script started (used for the runtime).
to_keep <- ls()
start_time <- Sys.time()

# Tracker file, with all column names lower-cased.
tracker <- read_dta("rawdata/trk2022tr_r.dta")
names(tracker) <- tolower(names(tracker))

# Person identifier built by pasting hhid and pn together, plus birth year
# with the value 0 recoded to NA.
tracker_birth_year <- tracker %>%
  mutate(
    rahhidpn = as.numeric(paste0(hhid, pn)),
    birthyr = ifelse(birthyr == 0, NA, birthyr)
  ) %>%
  select(hhid, pn, rahhidpn, birthyr)

# Balanced panel: shlt is set to 6 where Dead == 1 and frailty32 is copied to
# frailty_bl; rows are sorted by person and year.
df <- read_dta("dtafiles/CleanPanelBalanced.dta") %>%
  mutate(
    shlt = ifelse(Dead == 1, 6, shlt),
    frailty_bl = frailty32
  ) %>%
  arrange(rahhidpn, year)

# Attach the birth year. agey_e is filled with year - birthyear only where it
# is missing; agey_e2 is always year - birthyear.
df <- df %>%
  left_join(tracker_birth_year, by = "rahhidpn") %>%
  mutate(
    birthyear = birthyr,
    agey_e = ifelse(!is.na(agey_e), agey_e, year - birthyear),
    agey_e2 = year - birthyear
  )

##########################
## With 6 periods
# One row per respondent, taken from their observations with agey_e in 52-65.
# The first in-window row is kept and carries the frailty values of the
# following rows (leads 1 to 7) together with flags describing which of
# those values are present.
dff <- df %>%
  filter(between(agey_e, 52, 65)) %>%
  arrange(rahhidpn, year) %>%
  group_by(rahhidpn) %>%
  # Row position inside the window and per-respondent summaries.
  mutate(
    n_obs = row_number(),
    max_nobs = max(n_obs),
    age_init = min(agey_e),
    age_second = sort(agey_e)[2],
    missing_frailty = sum(is.na(frailty_bl))
  ) %>%
  # Ages and frailty observed 1 to 7 rows later for the same respondent.
  mutate(
    F4_Age = lead(agey_e, 4),
    F5_Age = lead(agey_e, 5),
    F6_Age = lead(agey_e, 6),
    F7_Age = lead(agey_e, 7),
    frailty = frailty_bl,
    F_frailty = lead(frailty_bl, 1),
    F2_frailty = lead(frailty_bl, 2),
    F3_frailty = lead(frailty_bl, 3),
    F4_frailty = lead(frailty_bl, 4),
    F5_frailty = lead(frailty_bl, 5),
    F6_frailty = lead(frailty_bl, 6),
    F7_frailty = lead(frailty_bl, 7)
  ) %>%
  mutate(
    # seven_row: frailty is present in this row and in the next six rows.
    seven_row = ifelse(
      !is.na(frailty) & !is.na(F_frailty) & !is.na(F2_frailty) &
        !is.na(F3_frailty) & !is.na(F4_frailty) & !is.na(F5_frailty) &
        !is.na(F6_frailty),
      1, 0
    ),
    # last_row: frailty is present in each of the next seven rows.
    last_row = ifelse(
      !is.na(F_frailty) & !is.na(F2_frailty) & !is.na(F3_frailty) &
        !is.na(F4_frailty) & !is.na(F5_frailty) & !is.na(F6_frailty) &
        !is.na(F7_frailty),
      1, 0
    ),
    # Frailty equal to 1 in the current / next row (NA when frailty is NA).
    death_1 = ifelse(frailty == 1, 1, 0),
    death_2 = ifelse(F_frailty == 1, 1, 0)
  ) %>%
  select(
    rahhidpn, year, birthyear, n_obs, age_init, max_nobs, age_second,
    F4_Age, F5_Age, F6_Age, F7_Age,
    missing_frailty, Dead,
    frailty, F_frailty, F2_frailty, F3_frailty, F4_frailty, F5_frailty,
    F6_frailty, F7_frailty,
    seven_row, last_row, death_1, death_2
  ) %>%
  # Keep only the first in-window row of each respondent.
  filter(n_obs == 1)

### Generate conditions for inclusion ->
# crit_1: first in-window age below 54, seven consecutive frailty values
#         starting at the first row, and frailty != 1 in that first row.
# crit_2: second in-window age below 54, no complete run from the first row
#         but a complete run of seven from the second row, and frailty != 1
#         in that second row.
dff <- mutate(
  dff,
  crit_1 = ifelse(age_init < 54 & seven_row == 1 & death_1 == 0, 1, 0),
  crit_2 = ifelse(
    age_second < 54 & seven_row == 0 & last_row == 1 & death_2 == 0,
    1, 0
  )
)
# Keep respondents satisfying either criterion.
dff <- filter(dff, crit_1 == 1 | crit_2 == 1)


## Strict sample: Age 52-53
# Respondents admitted through crit_2 start one row later, so every frailty
# column is replaced by the value one lead further on. The replacements run
# from frailty upwards so that each step still reads the unshifted value of
# the next column.
# NOTE(review): the frailty window ends at lead 6 (crit_1) or lead 7 (crit_2),
# whereas Final_Age is taken from F5_Age / F6_Age - confirm this is intended.
df_strict <- dff %>%
  mutate(
    shift_wave = crit_2 == 1,
    frailty = ifelse(shift_wave, F_frailty, frailty),
    F_frailty = ifelse(shift_wave, F2_frailty, F_frailty),
    F2_frailty = ifelse(shift_wave, F3_frailty, F2_frailty),
    F3_frailty = ifelse(shift_wave, F4_frailty, F3_frailty),
    F4_frailty = ifelse(shift_wave, F5_frailty, F4_frailty),
    F5_frailty = ifelse(shift_wave, F6_frailty, F5_frailty),
    F6_frailty = ifelse(shift_wave, F7_frailty, F6_frailty),
    Initial_Age = ifelse(shift_wave, age_second, age_init),
    Final_Age = ifelse(shift_wave, F6_Age, F5_Age)
  ) %>%
  ungroup() %>%
  select(
    rahhidpn,
    frailty, F_frailty, F2_frailty, F3_frailty, F4_frailty, F5_frailty,
    F6_frailty,
    Initial_Age, Final_Age
  )

## Run Clustering
# Group the rows of `dta` with k-means and label the groups by the size of
# their centres: label 1 is the cluster whose centre has the lowest mean
# across the clustering columns, label k the one with the highest.
#
# Args:
#   var:  character vector of column-name suffixes. Every column of `dta`
#         whose name ends with one of them (ignoring case, as
#         dplyr::ends_with() does by default) is used for clustering.
#   k:    number of clusters, passed to stats::kmeans().
#   dta:  data frame holding an id column `rahhidpn` and the clustering
#         columns; those columns must not contain missing values.
#   name: name of the label column in the result.
#   seed: optional. When not NULL it is passed to set.seed() (which changes
#         the session's RNG state) so the random starts are reproducible.
#
# Returns:
#   The `rahhidpn` column of `dta` plus one numeric column `name` holding
#   the ordered cluster label of each row.
cluster <- function(var, k, dta, name = "cluster_52_frailty_bl", seed = NULL) {
  col_names <- tolower(names(dta))
  is_feature <- rep(FALSE, length(col_names))
  for (suffix in tolower(var)) {
    is_feature <- is_feature | endsWith(col_names, suffix)
  }
  if (!any(is_feature)) {
    stop(
      "No column of `dta` ends with: ", paste(var, collapse = ", "),
      call. = FALSE
    )
  }

  features <- as.matrix(dta[, is_feature, drop = FALSE])
  # kmeans() fails on NA with an unhelpful foreign-call error; fail early.
  if (anyNA(features)) {
    stop("The clustering columns contain missing values.", call. = FALSE)
  }

  if (!is.null(seed)) {
    set.seed(seed)
  }
  fit <- stats::kmeans(features, centers = k, iter.max = 50, nstart = 1000)

  # kmeans numbers its clusters arbitrarily; rank the centres by their mean so
  # the labels are ordered. ties.method = "first" keeps the labels a
  # permutation of 1..k even if two centres share the same mean.
  centre_rank <- rank(rowMeans(fit$centers), ties.method = "first")

  out <- dta["rahhidpn"]
  out[[name]] <- as.numeric(unname(centre_rank[fit$cluster]))
  out
}

# Cluster the strict sample into five groups on the columns ending in
# "frailty".
var <- "frailty"
dta <- df_strict
clustered <- cluster(var, 5, dta)

# Number of respondents in each cluster, and the same as a percentage.
cluster_counts <- table(clustered[["cluster_52_frailty_bl"]])
cluster_shares <- 100 * prop.table(cluster_counts)

# Plot labels: the i-th label shows the share of cluster i under the type
# number custom_order[i].
# NOTE(review): this cluster-to-type mapping is hard-coded - confirm it still
# matches the clusters whenever the sample or the clustering changes.
custom_order <- c(1, 2, 4, 5, 3)
labels <- sprintf("Type %d\n(%.1f%%)", custom_order, cluster_shares)

# Attach the cluster label and the initial/final ages to the full panel,
# keeping only respondents that were clustered.
clustered_out <- dta %>%
  inner_join(clustered, by = "rahhidpn") %>%
  select(rahhidpn, cluster_52_frailty_bl, Initial_Age, Final_Age)
dta_out <- df %>%
  inner_join(clustered_out, by = "rahhidpn")


# Panel rows of clustered respondents from their initial age onwards; posit
# numbers each respondent's rows in year order (1 = first retained row).
df5 <- dta_out %>%
  filter(!is.na(cluster_52_frailty_bl), agey_e >= Initial_Age) %>%
  arrange(rahhidpn, year) %>%
  group_by(rahhidpn) %>%
  mutate(posit = row_number())

# Self-rated health as a factor (shlt was set to 6 where Dead == 1 when the
# panel was loaded).
df5$srhs <- as.factor(df5$shlt)

# One indicator per category: 1 if srhs equals the category, 0 if it is
# another category, NA if srhs is missing.
df5$srhs_1 <- as.numeric(df5$srhs == 1)  # Excellent Health
df5$srhs_2 <- as.numeric(df5$srhs == 2)  # Very Good Health
df5$srhs_3 <- as.numeric(df5$srhs == 3)  # Good Health
df5$srhs_4 <- as.numeric(df5$srhs == 4)  # Fair Health
df5$srhs_5 <- as.numeric(df5$srhs == 5)  # Poor Health
df5$srhs_6 <- as.numeric(df5$srhs == 6)  # Death

#df5$srhs_6 <- as.numeric(ifelse(df5$Dead==1,1,df5$srhs_6))

# good_health: 1 for categories 1-3, 0 for categories 4-6, NA if srhs is NA.
df5 <- df5 %>%
  mutate(
    good_health = ifelse(srhs_1 == 1 | srhs_2 == 1 | srhs_3 == 1, 1, 0)
  )



###
# Options shared by the plots below
opt_size_grid  <- 0.5  # line size of the major grid lines
opt_size_vline <- 1.5  # line size of the vertical reference line
opt_heigth     <- 5.5  # height passed to ggsave()
opt_width      <- 6.5  # width passed to ggsave()
opt_dpi        <- 300  # dpi passed to ggsave()

####
# Cells with fewer than n_bar observations are blanked (set to NA) before
# plotting; 0 disables the blanking.
n_bar <- 0

df <- df5
## Mean frailty by age (fig15-1)
# One row per (cluster, position) cell. The summaries are reduced to a single
# value per cell inside summarise(); Age and the cluster factor are derived
# from the grouping keys afterwards, so summarise() never returns more than
# one row per group and no unique() de-duplication is needed.
# Age = 52 + 2 * (posit - 1) - presumably biennial rows starting at age 52;
# verify against the panel's wave spacing.
plot_dta <- df %>%
  group_by(cluster_52_frailty_bl, posit) %>%
  summarise(
    temp = mean(frailty_bl, na.rm = TRUE),
    temp1 = mean(hitot, na.rm = TRUE),
    temp2 = mean(hatotb, na.rm = TRUE),
    temp3 = median(hitot, na.rm = TRUE),
    temp4 = median(hatotb, na.rm = TRUE),
    n_obs = sum(!is.na(frailty_bl)),
    .groups = "drop"
  ) %>%
  mutate(
    Age = 52 + (posit - 1) * 2,
    cluster = as.factor(cluster_52_frailty_bl),
    # Blank cells with fewer than n_bar non-missing frailty values.
    temp = ifelse(n_obs < n_bar, NA, temp)
  )

# The x scale is defined once, by scale_x_continuous(); an earlier xlim()
# would simply be replaced by it.
ggplot(plot_dta, aes(x = Age, y = temp, linetype = cluster)) +
  ylab("Frailty") + ylim(0, 1) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = unique(plot_dta$Age),
                     limits = c(52, 74)) +
  # coord_cartesian(xlim = c(52, 74)) +  # Ensure no padding around x-axis
  scale_linetype_manual(values = c("1" = "dotted", "2" = "dotdash", "3" = "dashed", "4" = "longdash", "5" = "solid")) +
  labs(linetype = "Cluster") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 16),
    legend.position = "none",  # Remove the legend
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 18),  # Increase the font size of the legend title
    panel.grid.major.x = element_line(color = "grey", size = opt_size_grid),  # Enable grid on x-axis
    panel.grid.minor.x = element_blank(),  # Disable minor grid lines on x-axis
    panel.grid.major.y = element_line(color = "grey", size = opt_size_grid),  # Enable grid on y-axis
    panel.grid.minor.y = element_blank()   # Disable minor grid lines on y-axis
  ) +
  geom_vline(xintercept = 64, linetype = "dashed", color = "red", size = opt_size_vline) +
  # One text label per cluster, placed by hand.
  annotate("text", x = c(72, 72, 72, 54, 57), y = c(0.15, 0.37, 0.65, 0.85, 0.75),
           label = labels,
           color = c("black", "black", "black", "black", "black"),
           size = 5, fontface = "bold")

# No plot is passed, so ggsave() writes the most recently created plot.
ggsave(
  filename = "output/Part2_output/Part2_b_graphs/fig15-1.pdf",
  width = opt_width,
  height = opt_heigth,
  dpi = opt_dpi,    # Only relevant for rasterized layers
  device = cairo_pdf  # Cairo PDF device
)
# Conditional EPS export (`eps` is not defined in this file)
if (eps == 1) {
  ggsave(
    filename = "output/Part2_output/Part2_b_graphs/fig15-1.eps",
    width = opt_width,
    height = opt_heigth,
    dpi = opt_dpi,    # Only relevant for rasterized layers
    device = cairo_ps  # Cairo PostScript device
  )
}

df <- df5
## Mean frailty by age, rows with frailty_bl < 1 only (fig15-2)
# One row per (cluster, position) cell. Age and the cluster factor are derived
# from the grouping keys after summarise(), so summarise() returns exactly one
# row per group and no unique() de-duplication is needed.
# Age = 52 + 2 * (posit - 1) - presumably biennial rows starting at age 52;
# verify against the panel's wave spacing.
plot_dta <- df %>%
  filter(frailty_bl < 1) %>%
  group_by(cluster_52_frailty_bl, posit) %>%
  summarise(
    temp = mean(frailty_bl, na.rm = TRUE),
    n_obs = sum(!is.na(frailty_bl)),
    .groups = "drop"
  ) %>%
  mutate(
    Age = 52 + (posit - 1) * 2,
    cluster = as.factor(cluster_52_frailty_bl),
    # Blank cells with fewer than n_bar non-missing frailty values.
    temp = ifelse(n_obs < n_bar, NA, temp)
  )

# The x scale is defined once, by scale_x_continuous(); an earlier xlim()
# would simply be replaced by it, so the x range follows the data and labels.
ggplot(plot_dta, aes(x = Age, y = temp, linetype = cluster)) +
  ylab("Frailty") + ylim(0, 1) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = unique(plot_dta$Age)) +
  scale_linetype_manual(values = c("1" = "dotted", "2" = "dotdash", "3" = "dashed", "4" = "longdash", "5" = "solid")) +
  labs(linetype = "Cluster") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 16),
    legend.position = "none",  # Remove the legend
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 18),  # Increase the font size of the legend title
    panel.grid.major.x = element_line(color = "grey", size = opt_size_grid),  # Enable grid on x-axis
    panel.grid.minor.x = element_blank(),  # Disable minor grid lines on x-axis
    panel.grid.major.y = element_line(color = "grey", size = opt_size_grid),  # Enable grid on y-axis
    panel.grid.minor.y = element_blank()   # Disable minor grid lines on y-axis
  ) +
  geom_vline(xintercept = 64, linetype = "dashed", color = "red", size = opt_size_vline) +
  # One text label per cluster, placed by hand.
  annotate("text", x = c(72, 72, 72, 54, 58), y = c(0.04, 0.39, 0.58, 0.70, 0.60),
           label = labels,
           color = c("black", "black", "black", "black", "black"),
           size = 5, fontface = "bold")

# No plot is passed, so ggsave() writes the most recently created plot.
ggsave(
  filename = "output/Part2_output/Part2_b_graphs/fig15-2.pdf",
  width = opt_width,
  height = opt_heigth,
  dpi = opt_dpi,    # Only relevant for rasterized layers
  device = cairo_pdf  # Cairo PDF device
)
# Conditional EPS export (`eps` is not defined in this file)
if (eps == 1) {
  ggsave(
    filename = "output/Part2_output/Part2_b_graphs/fig15-2.eps",
    width = opt_width,
    height = opt_heigth,
    dpi = opt_dpi,    # Only relevant for rasterized layers
    device = cairo_ps  # Cairo PostScript device
  )
}

## Fraction dead by type (fig15-3)
df <- df5
# One row per (cluster, position) cell with counts and shares:
#   fraction_alive      = rows with Dead == 0 / rows with non-missing Dead
#   fraction_goodhealth = rows with good_health == 1 /
#                         rows with Dead == 0 and non-missing srhs
# Age and the cluster factor are derived from the grouping keys after
# summarise(), so summarise() returns exactly one row per group and no
# unique() de-duplication is needed.
# Age = 52 + 2 * (posit - 1) - presumably biennial rows starting at age 52;
# verify against the panel's wave spacing.
plot_dta <- df %>%
  group_by(cluster_52_frailty_bl, posit) %>%
  summarise(
    total_n = n(),
    total_n_with_data = sum(!is.na(Dead)),
    total_n_with_srhs = sum(!is.na(srhs)),
    total_alive = sum(Dead == 0, na.rm = TRUE),
    total_alive_with_srhs = sum(!is.na(srhs) & Dead == 0),
    fraction_alive = total_alive / total_n_with_data,
    fraction_dead = 1 - fraction_alive,
    fraction_goodhealth = sum(good_health == 1, na.rm = TRUE) / total_alive_with_srhs,
    fraction_badhealth = 1 - fraction_goodhealth,
    .groups = "drop"
  ) %>%
  mutate(
    Age = 52 + (posit - 1) * 2,
    cluster = as.factor(cluster_52_frailty_bl),
    # Blank the health shares of cells with fewer than n_bar usable rows.
    fraction_goodhealth = ifelse(total_alive_with_srhs < n_bar, NA, fraction_goodhealth),
    fraction_badhealth = ifelse(total_alive_with_srhs < n_bar, NA, fraction_badhealth)
  )

# The x scale is defined once, by scale_x_continuous(); an earlier xlim()
# would simply be replaced by it, so the x range follows the data and labels.
ggplot(plot_dta, aes(x = Age, y = fraction_dead, linetype = cluster)) +
  ylab("Fraction dead") + ylim(0, 1) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = unique(plot_dta$Age)) +
  scale_linetype_manual(values = c("1" = "dotted", "2" = "dotdash", "3" = "dashed", "4" = "longdash", "5" = "solid")) +
  labs(linetype = "Cluster") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 16),
    legend.position = "none",  # Remove the legend
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 18),  # Increase the font size of the legend title
    panel.grid.major.x = element_line(color = "grey", size = opt_size_grid),  # Enable grid on x-axis
    panel.grid.minor.x = element_blank(),  # Disable minor grid lines on x-axis
    panel.grid.major.y = element_line(color = "grey", size = opt_size_grid),  # Enable grid on y-axis
    panel.grid.minor.y = element_blank()   # Disable minor grid lines on y-axis
  ) +
  geom_vline(xintercept = 64, linetype = "dashed", color = "red", size = opt_size_vline) +
  # One text label per cluster, placed by hand.
  annotate("text", x = c(73, 73, 73, 54, 57), y = c(0.03, 0.32, 0.55, 0.65, 0.55),
           label = labels,
           color = c("black", "black", "black", "black", "black"),
           size = 5, fontface = "bold")

# No plot is passed, so ggsave() writes the most recently created plot.
ggsave(
  filename = "output/Part2_output/Part2_b_graphs/fig15-3.pdf",
  width = opt_width,
  height = opt_heigth,
  dpi = opt_dpi,    # Only relevant for rasterized layers
  device = cairo_pdf  # Cairo PDF device
)
# Conditional EPS export (`eps` is not defined in this file)
if (eps == 1) {
  ggsave(
    filename = "output/Part2_output/Part2_b_graphs/fig15-3.eps",
    width = opt_width,
    height = opt_heigth,
    dpi = opt_dpi,    # Only relevant for rasterized layers
    device = cairo_ps  # Cairo PostScript device
  )
}


# Report how long the script took.
end_time <- Sys.time()
runtime <- difftime(end_time, start_time)
print(runtime)

# Remove every object that was not present when `to_keep` was recorded.
rm(list = setdiff(ls(), to_keep))