
# STEP 1: Create the Initial Distribution at age 52-53
#
# Collapses `df_imputed` to one row per respondent (`rahhidpn`): the first row
# of each respondent after sorting by `agey_e`. Before collapsing, values from
# later rows of the same respondent are copied onto each row as wide columns:
#   m_k       = `mstatusd`   k - 1 rows ahead (k = 1..12)
#   d_srhs_k  = `srhs`       k - 1 rows ahead (k = 1..6)
#   frailty_k = `frailty_bl` k - 1 rows ahead (k = 1..6)
# `lead()` yields NA when a respondent has fewer rows than the requested offset.
#
# Flags created here:
#   out_init_dist  = 1 when any of PI, mstatusd, genderd, shlt,
#                    cluster_52_frailty_bl or educ_cat is missing, else 0.
#   out_init_dist2 = 1 when Initial_Age > 53 or out_init_dist == 1, else 0
#                    (NA when Initial_Age is NA and out_init_dist == 0).
Data_to_draw_from <- df_imputed %>%
  arrange(rahhidpn, agey_e) %>%
  group_by(rahhidpn) %>%
  mutate(
    n_obs = row_number(),
    out_init_dist = ifelse(
      is.na(PI) | is.na(mstatusd) | is.na(genderd) | is.na(shlt) |
        is.na(cluster_52_frailty_bl) | is.na(educ_cat),
      1, 0
    ),
    # Marital status path: m_k looks k - 1 rows ahead.
    m_1 = mstatusd,
    m_2 = lead(mstatusd, 1),
    m_3 = lead(mstatusd, 2),
    m_4 = lead(mstatusd, 3),
    m_5 = lead(mstatusd, 4),
    m_6 = lead(mstatusd, 5),
    m_7 = lead(mstatusd, 6),
    m_8 = lead(mstatusd, 7),
    m_9 = lead(mstatusd, 8),
    m_10 = lead(mstatusd, 9),
    m_11 = lead(mstatusd, 10),
    # Offset 11 keeps the sequence contiguous; an offset of 12 here would
    # skip one row and duplicate nothing, leaving a gap in the path.
    m_12 = lead(mstatusd, 11),
    # Self-reported health path.
    d_srhs_1 = srhs,
    d_srhs_2 = lead(srhs, 1),
    d_srhs_3 = lead(srhs, 2),
    d_srhs_4 = lead(srhs, 3),
    d_srhs_5 = lead(srhs, 4),
    d_srhs_6 = lead(srhs, 5),
    # Frailty path.
    frailty_1 = frailty_bl,
    frailty_2 = lead(frailty_bl, 1),
    frailty_3 = lead(frailty_bl, 2),
    frailty_4 = lead(frailty_bl, 3),
    frailty_5 = lead(frailty_bl, 4),
    frailty_6 = lead(frailty_bl, 5)
  ) %>%
  # Keep only the first row per respondent; the leads above already carry
  # the later rows' values.
  filter(n_obs == 1) %>%
  mutate(
    out_init_dist2 = ifelse(Initial_Age > 53 | out_init_dist == 1, 1, 0)
  ) %>%
  ungroup()

# Impute marital status if missing.
#
# Forward-fills the m_2..m_12 columns: a missing m_k takes the value of
# m_{k-1}. `mutate()` evaluates its arguments in order, so each step sees the
# already-filled previous column and a value propagates across consecutive
# gaps. m_1 itself is never filled here; if it is NA the NAs carry forward
# until the first observed value.
Data_to_draw_from <- Data_to_draw_from %>%
  mutate(
    m_2 = ifelse(is.na(m_2), m_1, m_2),
    m_3 = ifelse(is.na(m_3), m_2, m_3),
    # Keep the observed m_4 when present (the else branch must be m_4, not
    # m_2, otherwise every observed m_4 is overwritten).
    m_4 = ifelse(is.na(m_4), m_3, m_4),
    m_5 = ifelse(is.na(m_5), m_4, m_5),
    m_6 = ifelse(is.na(m_6), m_5, m_6),
    m_7 = ifelse(is.na(m_7), m_6, m_7),
    m_8 = ifelse(is.na(m_8), m_7, m_8),
    m_9 = ifelse(is.na(m_9), m_8, m_9),
    m_10 = ifelse(is.na(m_10), m_9, m_10),
    m_11 = ifelse(is.na(m_11), m_10, m_11),
    m_12 = ifelse(is.na(m_12), m_11, m_12)
  )

# One-row diagnostic table of missingness in the initial-distribution data.
#   missing_count_*     : number of rows with NA in the named variable
#   missing_count_Total : rows with NA in at least one of PI, mstatusd,
#                         genderd, srhs, cluster_52_frailty_bl, educ_cat
#   check_sum / check_sum2 : totals of the out_init_dist / out_init_dist2 flags
#   total_obs           : sum of n_obs
# NOTE(review): Missing_count_any tests `srhs`, whereas out_init_dist is built
# from `shlt`; missing_count_Total and check_sum can therefore differ -
# confirm which variable is intended.
missing_summary <- Data_to_draw_from %>%
  mutate(
    # 0 when every listed field is present, 1 otherwise.
    Missing_count_any = ifelse(
      !is.na(PI) & !is.na(mstatusd) & !is.na(genderd) & !is.na(srhs) &
        !is.na(cluster_52_frailty_bl) & !is.na(educ_cat),
      0, 1
    )
  ) %>%
  summarise(
    missing_count_PI = sum(is.na(PI)),
    missing_count_gender = sum(is.na(genderd)),
    missing_count_MS = sum(is.na(mstatusd)),
    missing_count_HS = sum(is.na(srhs)),
    missing_count_educ = sum(is.na(educ_cat)),
    missing_count_Total = sum(Missing_count_any),
    check_sum = sum(out_init_dist),
    check_sum2 = sum(out_init_dist2),
    total_obs = sum(n_obs)
  )


# Columns carried into the initial distribution.
# Identifiers and baseline covariates, plus the srhs_* and educ_cat_* columns
# (presumably indicator columns created upstream - verify against df_imputed).
varlist_t_k <- c(
  "rahhidpn", "agey_e", "shlt", "cluster_52_frailty_bl", "genderd",
  "mstatusd", "PI",
  paste0("srhs_", 1:6),
  paste0("educ_cat_", 1:3)
)
# Marital-status path m_1..m_12.
varlist_t_k_2 <- paste0("m_", 1:12)
# Self-reported health and frailty paths (six columns each).
varlist_t_k_3 <- c(paste0("d_srhs_", 1:6), paste0("frailty_", 1:6))

# Keep rows not flagged for exclusion (out_init_dist2 == 0) and only the
# listed columns, in the order of the three name vectors.
init_distribution <- Data_to_draw_from %>%
  filter(out_init_dist2 == 0) %>%
  select(all_of(c(varlist_t_k, varlist_t_k_2, varlist_t_k_3)))

# Respondent IDs (`rahhidpn`) of the rows retained in `init_distribution`,
# i.e. those with enough information for simulation and estimation.
insample <- init_distribution[["rahhidpn"]]

# Change the working directory to `main_path` (defined outside this section).
setwd(main_path)
