
setwd(main_path)

# Import relevant libraries
library(openxlsx)
library(nnet) 
library(kableExtra)

#Store Environment
to_keep <- ls()
start_time <- Sys.time()


df_imputed<- df_proc 

# STEP 1: Keep with individuals with enough information
setwd(code_path_p3)
source("Part3_b_and_c_get_init_distribution.R")



################################################################################
############# Testing different specification for health dynamics ##############
################################################################################

set.seed(123)
#Randomly shuffle the data
yourData<-Data_to_draw_from[sample(nrow(Data_to_draw_from)),]

#Create 10 equally size folds
folds <- cut(seq(1,nrow(yourData)),breaks=10,labels=FALSE)


# Define accuracy
calculate_accuracy <- function(model, train, test) {
  
  # Predicting the values for train dataset
  train$ClassPredicted <- predict(model, newdata = train, "class")
  
  # Building classification table
  tab_train <- table(train$lead_srhs, train$ClassPredicted)
  # Calculating accuracy - sum of diagonal elements divided by total obs
  a <- round((sum(diag(tab_train)) / sum(tab_train)) * 100, 5)
  
  
  #####            Predicting the values for test dataset   #############################
  test$ClassPredicted <- predict(model, newdata = test, "class")
  
  tab_test_acc <- table(test$lead_srhs, test$ClassPredicted)
  # Calculating accuracy - sum of diagonal elements divided by total obs
  v_acc <- round((sum(diag(tab_test_acc)) / sum(tab_test_acc)) * 100, 5)
  
  prob_predict <- predict(model, newdata = test, "prob")
  
  
  e_p <- round(prob_predict,10)
  e_n <- test %>% ungroup() %>% select(rahhidpn,lead_srhs)
  e_1 <- cbind(e_n,e_p)
  ## Expand data by 1000
  names(e_1) <- c("rahhidpn","lead_srhs","srhs_1","srhs_2","srhs_3","srhs_4","srhs_5","srhs_6")
  
  
  #Compute thresholds
  #Naive =0 means types are  distribute as in the data
  thresholds<- e_1 %>% mutate(interval_1=srhs_1,
                              interval_2=interval_1 + srhs_2,
                              interval_3=interval_2 + srhs_3,
                              interval_4=interval_3 + srhs_4,
                              interval_5=interval_4 + srhs_5,
                              interval_6=interval_5 + srhs_6)
  
  N <- 100
  # Expand e_1 by repeating each row N times
  thresholds <- thresholds %>%
    slice(rep(row_number(), each = N))
  
  
  #Generate random states
  thresholds$random_state<-runif(length(thresholds$rahhidpn))
  
  
  # Assigment
  thresholds <- thresholds %>% 
    mutate(Srhs_1 = ifelse(random_state<=interval_1,1,0),
           Srhs_2 = ifelse(random_state<=interval_2 & random_state>interval_1,1,0),
           Srhs_3 = ifelse(random_state<=interval_3 & random_state>interval_2,1,0),
           Srhs_4 = ifelse(random_state<=interval_4 & random_state>interval_3,1,0),
           Srhs_5 = ifelse(random_state<=interval_5 & random_state>interval_4,1,0),
           Srhs_6 = ifelse(random_state<=interval_6 & random_state>interval_5,1,0),
           class_predicted = ifelse(Srhs_1==1,1,
                                    ifelse(Srhs_2==1,2,
                                           ifelse(Srhs_3==1,3,
                                                  ifelse(Srhs_4==1,4,
                                                         ifelse(Srhs_5==1,5,
                                                                ifelse(Srhs_6==1,6,NA)))))))
  
  final_data<-thresholds %>% select("rahhidpn","lead_srhs","class_predicted") 
  
  # Building classification table
  tab_test <- table(final_data$lead_srhs, final_data$class_predicted)
  # Calculating accuracy - sum of diagonal elements divided by total obs
  v <- round((sum(diag(tab_test)) / sum(tab_test)) * 100, 5)
  
  
  return(list(a = a, v = v, v_acc=v_acc))
}


#Perform 10 fold cross validation

formula_1 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5 
                          + educ_cat_2 + educ_cat_3 + mstatusd)*(factor(genderd))


formula_2 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5 
                          + educ_cat_2 + educ_cat_3 + mstatusd)*(factor(genderd))  + factor(cluster_52_frailty_bl)

formula_3 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5) + factor(cluster_52_frailty_bl)

formula_4 <- lead_srhs ~ (srhs_2 + srhs_3 + srhs_4 + srhs_5) + factor(cluster_52_frailty_bl)

formula_5 <- lead_srhs ~ (srhs_2 + srhs_3 + srhs_4 + srhs_5)

formula_6 <- lead_srhs ~ 1

#Number of models
pp<-6

# Initialize matrices to store results
n_obs_matrix <- matrix(NA, nrow = 1, ncol = pp)
log_likelihood_matrix <- matrix(NA, nrow = 1, ncol = pp)
pseudo_r2_matrix <- matrix(NA, nrow = 1, ncol = pp)
aic_matrix <- matrix(NA, nrow = 1, ncol = pp)
bic_matrix <- matrix(NA, nrow = 1, ncol = pp)

accuracy_train<- matrix(NA, nrow = 1, ncol = pp)
accuracy_test_acc<- matrix(NA, nrow = 1, ncol = pp)
accuracy_test<- matrix(NA, nrow = 1, ncol = pp)

Formulas <- vector("list", length = pp)

model_data <- subset(df_imputed, Death == 0 & rahhidpn %in% insample)

# Fit the models and store results
for (i in 1:pp) {
  # Fit the model
  formula_to_run <- get(paste0("formula_", i))
  k<-i
  
  model_full <- multinom(formula_to_run,model=TRUE, data = model_data, maxit = 5000)
  
  # Number of observations
  n_obs <- nrow(model_full$fitted.values)
  n_obs_matrix[1, k] <- n_obs
  
  # Log-likelihood
  log_likelihood <- logLik(model_full)
  log_likelihood_matrix[1, k] <- log_likelihood
  
  # Pseudo R-squared (McFadden's R-squared)
  PseudoR2_model <- PseudoR2(model_full, which = NULL)
  pseudo_r2_matrix[1, k] <- PseudoR2_model
  
  # AIC
  aic_value <- AIC(model_full)
  aic_matrix[1, k] <- aic_value
  
  # BIC
  bic_value <- BIC(model_full)
  bic_matrix[1, k] <- bic_value
  
  
  ## Estimate Accuracy
  accuracy_train_10_fold <- matrix(NA, nrow = 1, ncol = 10)
  accuracy_test_10_fold <- matrix(NA, nrow = 1, ncol = 10)
  accuracy_test_10_fold_acc <- matrix(NA, nrow = 1, ncol = 10)
  
  
  for(j in 1:10){
    #Segement your data by fold using the which() function 
    testIndexes <- which(folds==j,arr.ind=TRUE)
    id_testData <- yourData[testIndexes, ] %>% select(rahhidpn)
    id_trainData <- yourData[-testIndexes, ] %>% select(rahhidpn)
    
    
    testData<- subset(df_imputed, Death == 0 & rahhidpn %in% id_testData$rahhidpn)
    trainData<- subset(df_imputed, Death == 0 & rahhidpn %in% id_trainData$rahhidpn)   
    #Use the test and train data partitions however you desire...
    
    model_for_accuracy <- multinom(formula_to_run,model=TRUE, data = trainData, maxit = 5000)
    
    result <- calculate_accuracy(model_for_accuracy, trainData, testData)
    
    accuracy_train_10_fold[1,j]<- result$a
    accuracy_test_10_fold[1,j]<- result$v
    accuracy_test_10_fold_acc[1,j]<- result$v_acc
    
  }
  
  
  accuracy_train[1,k]<- mean(accuracy_train_10_fold)
  accuracy_test[1,k]<- mean(accuracy_test_10_fold)
  accuracy_test_acc[1,k]<- mean(accuracy_test_10_fold_acc)
  
  Formulas[[k]] <- formula_to_run
  
}


# Transpose matrices
n_obs_matrix <- t(n_obs_matrix)
log_likelihood_matrix <- t(log_likelihood_matrix)
pseudo_r2_matrix <- t(pseudo_r2_matrix)
aic_matrix <- t(aic_matrix)
bic_matrix <- t(bic_matrix)
accuracy_train <- t(accuracy_train)
accuracy_test <- t(accuracy_test)

accuracy_test_acc <- t(accuracy_test_acc)


formulas_as_strings <- sapply(Formulas, as.character)

# Create a dataframe with formulas
formulas_df <- data.frame(Formulas = formulas_as_strings)
formulas_df <- formulas_df[3,] 
formulas_df<-t(formulas_df)

# Combine results into a dataframe
results_df <- data.frame(
  Model = paste0("Model ", 1:pp),
  Observations = n_obs_matrix[, 1],
  LogLikelihood = log_likelihood_matrix[, 1],
  PseudoR2 = pseudo_r2_matrix[, 1],
  AIC = aic_matrix[, 1],
  BIC = bic_matrix[, 1],
  Acc_train = accuracy_train[,1],
  Acc_test_acc = accuracy_test_acc[,1],
  Acc_test = accuracy_test[,1],
  Formulas = formulas_df[,1]
)

# Print the results
print(results_df)

# Define file paths
file_path_excel <- "output/Part3_output/Part3_b_testing/model_performances.xlsx"
file_path_csv <- "output/Part3_output/Part3_b_testing/model_performances.csv"

# Write dataframe to Excel file
write.xlsx(results_df, file_path_excel, rowNames = FALSE)

# Write dataframe to CSV file
write.csv(results_df, file_path_csv, row.names = FALSE)


################################################################################
# Table 6 
################################################################################
formula1 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5 
                         + educ_cat_2 + educ_cat_3 + mstatusd)*(factor(genderd))


formula2 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5 
                         + educ_cat_2 + educ_cat_3 + mstatusd)*(factor(genderd))  + factor(cluster_52_frailty_bl)

model_data <- subset(df_imputed, Death == 0 & rahhidpn %in% insample)
# Fit the models
model1 <- multinom(formula1, data = model_data, maxit = 5000)
model2 <- multinom(formula2, data = model_data, maxit = 5000)

# Extract metrics
logLik1 <- logLik(model1)
logLik2 <- logLik(model2)
aic1 <- AIC(model1)
aic2 <- AIC(model2)
bic1 <- BIC(model1)
bic2 <- BIC(model2)

# Calculate pseudo-R^2
pseudo_r2_model1 <- pR2(model1)["McFadden"]
pseudo_r2_model2 <- pR2(model2)["McFadden"]

# Log-likelihood ratio test p-value
lr_test <- lrtest(model1, model2)

# Create the data for the table
table_data <- data.frame(
  Metric = c(
    "Log-likelihood", 
    "Log-likelihood ratio test p-value", 
    "Akaike information criterion", 
    "Bayesian information criterion", 
    "Pseudo-$R^2$"
  ),
  Without_Health_Types = c(
    format(round(as.numeric(logLik1), 0), nsmall = 0),  # Format without decimals
    "-", 
    format(round(aic1, 0), nsmall = 0), 
    format(round(bic1, 0), nsmall = 0), 
    format(round(as.numeric(pseudo_r2_model1), 3), nsmall = 3)  # Keep 3 decimals
  ),
  Including_Health_Types = c(
    format(round(as.numeric(logLik2), 0), nsmall = 0),  # Format without decimals
    format(as.numeric(lr_test[2,5]), nsmall = 2),       # Ensure p-value has 3 decimals
    format(round(aic2, 0), nsmall = 0), 
    format(round(bic2, 0), nsmall = 0), 
    format(round(as.numeric(pseudo_r2_model2), 3), nsmall = 3)  # Keep 3 decimals
  )
)

# Create the LaTeX table
latex_table <- kable(
  table_data, 
  format = "latex", 
  booktabs = TRUE, 
  escape = FALSE,
  align = c("l", "c", "c"),  # Align first column left, others center
  col.names = c("", "Without health types", "Including health types")
) %>%
  add_header_above(c(" " = 1, "Future SRHS" = 2)) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE)


# Save the table to a file
writeLines(latex_table, "output/Part3_output/Part3_b_testing/tab6.tex")


################################################################################
# Table 7
################################################################################
formula1 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5 
                         + educ_cat_2 + educ_cat_3 + mstatusd)*(factor(genderd))


formula2 <- lead_srhs ~ (age_1 + age_2  + srhs_2 + srhs_3 + srhs_4 + srhs_5)  + factor(cluster_52_frailty_bl)

model_data <- subset(df_imputed, Death == 0 & rahhidpn %in% insample)
# Fit the models
model1 <- multinom(formula1, data = model_data, maxit = 5000)
model2 <- multinom(formula2, data = model_data, maxit = 5000)

# Extract metrics
logLik1 <- logLik(model1)
logLik2 <- logLik(model2)
aic1 <- AIC(model1)
aic2 <- AIC(model2)
bic1 <- BIC(model1)
bic2 <- BIC(model2)

# Calculate pseudo-R^2
pseudo_r2_model1 <- pR2(model1)["McFadden"]
pseudo_r2_model2 <- pR2(model2)["McFadden"]

# Log-likelihood ratio test p-value
lr_test <- lrtest(model1, model2)

# Create the data for the table
table_data <- data.frame(
  Metric = c(
    "Log-likelihood", 
    "Log-likelihood ratio test p-value", 
    "Akaike information criterion", 
    "Bayesian information criterion", 
    "Pseudo-$R^2$"
  ),
  Without_Health_Types = c(
    format(round(as.numeric(logLik1), 0), nsmall = 0),  # Format without decimals
    "-", 
    format(round(aic1, 0), nsmall = 0), 
    format(round(bic1, 0), nsmall = 0), 
    format(round(as.numeric(pseudo_r2_model1), 3), nsmall = 3)  # Keep 3 decimals
  ),
  Including_Health_Types = c(
    format(round(as.numeric(logLik2), 0), nsmall = 0),  # Format without decimals
    format(as.numeric(lr_test[2,5]), nsmall = 2),       # Ensure p-value has 3 decimals
    format(round(aic2, 0), nsmall = 0), 
    format(round(bic2, 0), nsmall = 0), 
    format(round(as.numeric(pseudo_r2_model2), 3), nsmall = 3)  # Keep 3 decimals
  )
)

# Create the LaTeX table
latex_table <- kable(
  table_data, 
  format = "latex", 
  booktabs = TRUE, 
  escape = FALSE,
  align = c("l", "c", "c"),  # Align first column left, others center
  col.names = c("", "Without health types", "Including health types")
) %>%
  add_header_above(c(" " = 1, "Future SRHS" = 2)) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE)

# Save the table to a file
writeLines(latex_table, "output/Part3_output/Part3_b_testing/tab7.tex")



end_time <- Sys.time()
runtime <- end_time-start_time
print(runtime)

############################################################

#Clear enviroment
rm(list = setdiff(ls(), c(to_keep)))


