
# Resolve all relative paths below (dtafiles/..., output/...) against main_path.
# NOTE(review): main_path is not defined in this script -- presumably the
# calling script sets it before this file is sourced (TODO confirm).
setwd(main_path)


### Import relevant libraries
library(tidyverse)
library(haven)
library(stats)
library(gcookbook)
library(ggplot2)
library(plot.matrix)
library(imguR)
library(directlabels)
library(ggrepel)
library(openxlsx)


# Snapshot the objects already present in the workspace so that the cleanup
# step at the end of the script removes only what this script created.
to_keep <- ls()
start_time <- Sys.time()
file_name <- "dtafiles/P52_5_Clusters.dta"

# Load the clustered panel, keep rows that have a baseline frailty cluster and
# an age of at least Initial_Age, then number each person's remaining rows in
# year order (posit = 1 is the earliest retained row). df5 stays grouped by
# rahhidpn after this step.
# NOTE(review): Initial_Age is not defined in this script; presumably it is set
# by the calling script or is a column of the .dta file -- confirm.
df5 <- read_dta(file_name) %>%
  filter(
    !is.na(cluster_52_frailty_bl),
    agey_e >= Initial_Age
  ) %>%
  arrange(rahhidpn, year) %>%
  group_by(rahhidpn) %>%
  mutate(posit = row_number())


# Turn self-rated health (shlt) into a factor and expand it into one 0/1
# indicator column per category code. Rows with missing shlt are NA in every
# indicator.
df5$srhs <- as.factor(df5$shlt)

# Code -> label, following the annotations of the original script:
#   1 = excellent, 2 = very good, 3 = good, 4 = fair, 5 = poor, 6 = death
for (srhs_code in 1:6) {
  # `==` on a factor returns NA for missing values, so as.numeric() yields
  # 1 (this category), 0 (another category) or NA (missing).
  df5[[paste0("srhs_", srhs_code)]] <- as.numeric(df5$srhs == srhs_code)
}
rm(srhs_code)

# Disabled alternative kept for reference: force srhs_6 to 1 when Dead == 1.
# df5$srhs_6 <- as.numeric(ifelse(df5$Dead == 1, 1, df5$srhs_6))

# good_health: 1 for excellent / very good / good, 0 for any other category,
# NA when shlt is missing.
df5 <- df5 %>%
  mutate(
    good_health = ifelse(srhs_1 == 1 | srhs_2 == 1 | srhs_3 == 1, 1, 0)
  )


# Largest non-missing value of x, or NA when x has no non-missing value.
# A bare max(x, na.rm = TRUE) warns and returns -Inf in that situation, which
# happened for every person without a Dead == 1 row (dead_clustering is all NA)
# and leaked -Inf into has_died for persons whose Dead is entirely missing.
max_or_na <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  max(x, na.rm = TRUE)
}

# Cause-of-death file: rebuild the person identifier by concatenating hhid and
# pn, then drop the two source columns.
# NOTE(review): assumes paste0(hhid, pn) reproduces rahhidpn exactly as stored
# in df5 (e.g. pn keeps any leading zeros) -- confirm against the data.
cause_of_death <- read_dta("dtafiles/DeathCause.dta") %>%
  mutate(rahhidpn = as.numeric(paste0(hhid, pn))) %>%
  select(-hhid, -pn)

# Attach the cause-of-death columns to every panel row of the matching person.
# NOTE(review): if cause_of_death ever holds several rows per rahhidpn this
# join duplicates panel rows -- verify the file is one row per person.
df5 <- left_join(df5, cause_of_death, by = "rahhidpn")

# Death flags. df5 remains grouped by rahhidpn after this step.
#   has_died            person-level maximum of Dead (NA if Dead is never
#                       observed for the person)
#   dead_clustering     row-level: on rows with Dead == 1, 1 if
#                       agey_e <= Final_Age and 0 otherwise; NA on other rows
#   has_died_clustering person-level: for persons with has_died == 1, the
#                       maximum of dead_clustering; NA otherwise
# NOTE(review): Final_Age is not defined in this script; presumably it comes
# from the calling script or the data -- confirm.
df5 <- df5 %>%
  arrange(rahhidpn, year) %>%
  group_by(rahhidpn) %>%
  mutate(
    has_died = max_or_na(Dead),
    dead_clustering = ifelse(
      Dead == 1,
      ifelse(agey_e <= Final_Age, 1, 0),
      NA
    ),
    has_died_clustering = ifelse(
      has_died == 1,
      max_or_na(dead_clustering),
      NA
    )
  )

# Columns contributed by the cause-of-death file (this includes rahhidpn).
column_names <- colnames(cause_of_death)

# Collapse the panel to the distinct combinations of person id, cluster,
# death flags and cause-of-death columns. Grouping is dropped first so the
# result does not carry the per-person grouping of df5.
df_final <- df5 %>%
  ungroup() %>%
  select(
    rahhidpn,
    cluster_52_frailty_bl,
    has_died,
    has_died_clustering,
    all_of(column_names)
  ) %>%
  distinct()

# Counts (not fractions) per cluster x has_died x has_died_clustering cell.
# Output column names are kept unchanged because they appear in the exported
# files.
df_summary <- df_final %>%
  group_by(cluster_52_frailty_bl, has_died, has_died_clustering) %>%
  summarize(
    # Number of rows of df_final in the cell
    num_observations = n(),
    # Number of rows with a non-missing cause-of-death code (rascod); the
    # column name says "age" but the count is over rascod
    num_non_missing_age = sum(!is.na(rascod)),
    # Number of rows per rascod code (missing rascod is ignored)
    cancer = sum(rascod == 1, na.rm = TRUE),
    skin = sum(rascod == 2, na.rm = TRUE),
    musco = sum(rascod == 3, na.rm = TRUE),
    heart = sum(rascod == 4, na.rm = TRUE),
    alleg = sum(rascod == 5, na.rm = TRUE),
    endocrin = sum(rascod == 6, na.rm = TRUE),
    digestive = sum(rascod == 7, na.rm = TRUE),
    neurologic = sum(rascod == 8, na.rm = TRUE),
    reprod = sum(rascod == 9, na.rm = TRUE),
    emotional = sum(rascod == 10, na.rm = TRUE),
    # Codes 11, 12 and 13 are pooled into one column
    other_systoms_micellaneous_other_healthcond = sum(
      rascod == 11 | rascod == 12 | rascod == 13,
      na.rm = TRUE
    ),
    not_health_condition = sum(rascod == 14, na.rm = TRUE),
    missing = sum(rascod == 15, na.rm = TRUE),
    # Number of rows per radexpec code (missing radexpec is ignored)
    expected = sum(radexpec == 1, na.rm = TRUE),
    unexpected = sum(radexpec == 2, na.rm = TRUE),
    other = sum(radexpec == 3, na.rm = TRUE),
    # Return a plain ungrouped table and silence the regrouping message
    .groups = "drop"
  )


# Output locations for the summary table: an Excel workbook and a CSV copy.
excel_file <- "output/Part2_output/Part2_h_cause_of_death/Cause_of_death.xlsx"
csv_file <- "output/Part2_output/Part2_h_cause_of_death/Cause_of_death.csv"

# Export the same table in both formats, without row names.
write.xlsx(x = df_summary, file = excel_file, row.names = FALSE)
write.csv(x = df_summary, file = csv_file, row.names = FALSE)

# Report the elapsed time since start_time was recorded.
end_time <- Sys.time()
runtime <- difftime(end_time, start_time)
print(runtime)

# Clear environment: remove every object that is not listed in to_keep, i.e.
# that was not present when the script took its snapshot.
rm(list = setdiff(ls(), to_keep))

