Paso 2.4

Análisis de clases latentes exploratoria y comparativa con predictores (glca)

Andrés González Santa Cruz
April 28, 2023
Show code
script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
Show code
 $(document).ready(function() {
    $('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
    // onClick function for all plots (img's)
    $('img:not(.zoomImg)').click(function() {
      $('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
      $('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
    });
    // onClick function for zoomImg
    $('img.zoomImg').click(function() {
      $('.zoomDiv').css({opacity: '0', width: '0%'}); 
    });
  });
  
Show code
<script src="hideOutput.js"></script> 
Show code
$(document).ready(function() {    
    $chunks = $('.fold');    
    $chunks.each(function () {      // add button to source code chunks     
    if ( $(this).hasClass('s') ) {       
        $('pre.r', this).prepend("<div class=\"showopt\">Show Source</div><br style=\"line-height:22px;\"/>");
            $('pre.r', this).children('code').attr('class', 'folded');     
            }      // add button to output chunks     
        if ( $(this).hasClass('o') ) {       
            $('pre:not(.r)', this).has('code').prepend("<div class=\"showopt\">Show Output</div><br style=\"line-height:22px;\"/>");       
            $('pre:not(.r)', this).children('code:not(r)').addClass('folded');        // add button to plots       
            $(this).find('img').wrap('<pre class=\"plot\"></pre>');       
            $('pre.plot', this).prepend("<div class=\"showopt\">Show Plot</div><br style=\"line-height:22px;\"/>");       
            $('pre.plot', this).children('img').addClass('folded');      
            }   
});    // hide all chunks when document is loaded   
    $('.folded').css('display', 'none')    // function to toggle the visibility   
    $('.showopt').click(function() {     
            var label = $(this).html();     
            if (label.indexOf("Show") >= 0) {       
                $(this).html(label.replace("Show", "Hide"));     
            } else {
              $(this).html(label.replace("Hide", "Show"));     
            }     
    $(this).siblings('code, img').slideToggle('fast', 'swing');   
    }); 
}); 

Cargamos los datos

Show code
rm(list = ls());gc()
         used (Mb) gc trigger (Mb) max used (Mb)
Ncells 535731 28.7    1208804 64.6   643711 34.4
Vcells 906385  7.0    8388608 64.0  1649728 12.6
Show code
load("data2_lca2_2023_04_26.RData")

Cargamos los paquetes

Show code
knitr::opts_chunk$set(echo = TRUE)

if(!require(poLCA)){install.packages("poLCA")}
if(!require(poLCAParallel)){devtools::install_github("QMUL/poLCAParallel@package")}
if(!require(compareGroups)){install.packages("compareGroups")}
if(!require(parallel)){install.packages("parallel")}
if(!require(Hmisc)){install.packages("Hmisc")}
if(!require(tidyverse)){install.packages("tidyverse")}
try(if(!require(sjPlot)){install.packages("sjPlot")})
if(!require(emmeans)){install.packages("emmeans")}
if(!require(nnet)){install.packages("nnet")}
if(!require(here)){install.packages("here")}
if(!require(doParallel)){install.packages("doParallel")}
if(!require(progress)){install.packages("progress")}
if(!require(caret)){install.packages("caret")}
if(!require(rpart)){install.packages("rpart")}
if(!require(rpart.plot)){install.packages("rpart.plot")}
if(!require(partykit)){install.packages("partykit")}
if(!require(randomForest)){install.packages("randomForest")}
if(!require(ggcorrplot)){install.packages("ggcorrplot")}
if(!require(polycor)){install.packages("polycor")}
if(!require(tableone)){install.packages("tableone")}
if(!require(broom)){install.packages("broom")}
if(!require(plotly)){install.packages("plotly")}
if(!require(rsvg)){install.packages("rsvg")}
if(!require(DiagrammeRsvg)){install.packages("DiagrammeRsvg")}
if(!require(glca)){install.packages("glca")}
#if(!require(poLCA)){githubinstall::gh_install_packages("poLCA", ref = github_pull("14"))}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

Definimos ciertas constantes

Show code
clus_iter= 500 #500
n_thread <- parallel::detectCores()
nrep <- clus_iter # number of different initial values (could be n_thread too)
n_class_max <- 10 # maximum number of classes to investigate
n_bootstrap <- 100 #30 # 50 number of bootstrap samples
print(n_thread)
[1] 8

Análisis de clases latentes

Especificar modelo

Show code
library(DiagrammeR) #⋉
gr_lca4<-
DiagrammeR::grViz([1352 chars quoted with '"'])#, width = 1200, height = 900
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3733703/
#Cohort matching on a variable associated with both outcome and censoring
#Cohort matching on a confounder. We let A denote an exposure, Y denote an outcome, and C denote a confounder and matching variable. The variable S indicates whether an individual in the source population is selected for the matched study (1: selected, 0: not selected). See Section 2-7 for details.
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7064555/
DPI = 1200
WidthCM = 21
HeightCM = 8

gr_lca4 %>%
  export_svg %>% charToRaw %>% rsvg_pdf("_flowchart_lca_adj._exp2pdf")

gr_lca4 %>% export_svg()%>%charToRaw %>% rsvg(width = WidthCM *(DPI/2.54), height = HeightCM *(DPI/2.54)) %>% png::writePNG("_flowchart_lca0_adj_exp2.png")

htmlwidgets::saveWidget(gr_lca4, "_flowchart_lca_adj_exp2.html")
webshot::webshot("_flowchart_lca_adj_exp2.html", "_flowchart_lca_adj_exp2.png",vwidth = 1200, vheight = 900,
        zoom = 2)
Gráfico esquemático

Figure 1: Gráfico esquemático

Los valores ceros se declararon datos perdidos.

Show code
f_adj<- item(CAUSAL, EDAD_MUJER_REC, PUEBLO_ORIGINARIO_REC, PAIS_ORIGEN_REC,HITO1_EDAD_GEST_SEM_REC, MACROZONA, PREV_TRAMO_REC) ~ outcome

Análisis de clases latentes

Show code
#Biemer, P. P., & Wiesen, C. (2002). Measurement error evaluation of self-reported drug use: a latent class analysis of the US National Household Survey on Drug Abuse. Journal of the Royal Statistical Society: Series A (Statistics in Society), 165(1), 97–119. doi:10.1111/1467-985x.00612  
#lca_entropia(x="ppio", seed= 2125, k= 8, f= f_preds, dat= mydata_preds, nbr_repet= 30, na_rm= T)
#3
#<div style="border: 1px solid #ddd; padding: 5px; overflow-y: scroll; height:400px; overflow-x: scroll; width:100%">
# f is the selected variables
# dat is the data
# nb_var is the number of selected variables
# k is the number of latent class generated
# nbr_repet is the number of repetition to  
# reach the convergence of EM algorithm
# x es el código para las variables de los modelos
#seed es el numero random para las semillas. ej: 4345.
#Modo de calcular el mejor modelo.
#z_ # 
#2023-01-20
#https://github.com/QMUL/poLCAParallel/blob/master/exec/3_blrt.R
#0h s
seed<-2125
old <- Sys.time()

lca22 <- glca(f_adj, data = mydata_preds3, nclass = 2, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca23 <- glca(f_adj, data = mydata_preds3, nclass = 3, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca24 <- glca(f_adj, data = mydata_preds3, nclass = 4, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca25 <- glca(f_adj, data = mydata_preds3, nclass = 5, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca26 <- glca(f_adj, data = mydata_preds3, nclass = 6, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca27 <- glca(f_adj, data = mydata_preds3, nclass = 7, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca28 <- glca(f_adj, data = mydata_preds3, nclass = 8, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca29 <- glca(f_adj, data = mydata_preds3, nclass = 9, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)
lca210 <- glca(f_adj, data = mydata_preds3, nclass = 10, seed = seed, verbose = FALSE, n.init = 5e2, decreasing=T, maxiter = 1e4,testiter = 500)

new_med<-(Sys.time())
paste0("The model took ",round(new_med-old,2)," until every LCA was computed")
[1] "The model took 1.55 until every LCA was computed"

Luego calculamos la razón de verosimilitud mediante remuestreo bootstrap (BLRT) entre los distintos modelos con el que asume una clase menos.

Show code
gof2<-
gofglca(lca22, lca23, lca24, lca25, lca26, lca27, lca28, lca29, lca210, test = "chisq")

bootlrt2<-
gofglca(lca22, lca23, lca24, lca25, lca26, lca27, lca28, lca29, lca210, test = "boot", nboot=n_bootstrap, seed=2125)

Resultados

Hicimos un gráfico de los resultados

Show code
manualcolors <- c('indianred1', 'cornflowerblue', 'gray50', 'darkolivegreen4', 'slateblue2', 
                  'firebrick4', 'goldenrod4')
levels4 <- c("logLik", "Gsq", "AIC", "CAIC", "BIC", "entropy", "Res.Df")
labels4 <- c('Log-Verosimilitud', 'Chi2', 'Criterio de Información\nde Akaike(AIC)'','AIC Corregido'','Criterio de Información\nBayesiano (BIC)')''Entropía'­a'Grados de libertad residuales')s')
fig_lca_fit4<- cbind.data.frame(rn=2:10,gof2$gtable) %>%
  data.frame() %>% 
  dplyr::mutate_if(is.character, as.numeric) %>%  # convert character columns to numeric
  tidyr::pivot_longer(cols = -rn,names_to = "indices", values_to = "value", values_drop_na = F) %>%
  dplyr::mutate(indices = factor(indices, levels = levels4, labels = labels4)) %>%
  dplyr::filter(grepl("(AIC|BIC)",indices, ignore.case=T))%>%
  dplyr::mutate(ModelIndex= factor(rn, levels=2:10)) %>% 
  ggplot(aes(x = ModelIndex, y = value, group = indices, color = indices, linetype = indices)) +
  geom_line(size = 1.5) +
  scale_color_manual(values = manualcolors) +
  #scale_linetype_manual(values = c("solid", "dashed", "dotted")) +
  labs(x = "Número de clases"", y=="Valor"", color=="Medida"", linetype=="Medida"))++
  #facet_wrap(.~indices, scales = "free_y", nrow = 4, ncol = 1) +
  theme_bw()

fig_lca_fit4
Show code
ggsave("_fig1_comparison_glca.png",fig_lca_fit4, dpi=600)
#International Journal of Workplace Health Management  (Zhang et al., 2018).

Luego en una tabla

Show code
cbind.data.frame(rn=2:10,gof2$gtable) %>%#
  dplyr::select(rn, everything()) %>% 
    dplyr::mutate_if(is.character, as.numeric) %>%  # convert character columns to numeric
    knitr::kable(format="markdown", caption="Índices de ajuste modelos")
Table 1: Índices de ajuste modelos
rn logLik AIC CAIC BIC entropy Res.Df Gsq
2 -27703.87 55511.75 55888.22 55836.22 0.9471951 3736 6995.394
3 -27399.61 54957.21 55529.16 55450.16 0.7933576 3709 6386.860
4 -27197.56 54607.12 55374.55 55268.55 0.7487319 3682 5982.768
5 -27045.36 54356.73 55319.63 55186.63 0.7940232 3655 5678.377
6 -26897.50 54115.00 55273.38 55113.38 0.8073065 3628 5382.646
7 -26763.76 53901.52 55255.38 55068.38 0.8196171 3601 5115.172
8 -26672.18 53772.37 55321.69 55107.69 0.7960240 3574 4932.012
9 -26600.22 53682.43 55427.24 55186.24 0.7901783 3547 4788.081
10 -26530.64 53597.28 55537.56 55269.56 0.8587455 3520 4648.923
Show code
best_model2<-
as.numeric(cbind.data.frame(rn=2:10,gof2$gtable) %>% dplyr::summarise(which.min(BIC)+1))

Presentamos el modelo con mejor ajuste

Show code
summary(eval(parse(text = paste0("lca2",best_model2)))) #

Call:
glca(formula = f_adj, data = mydata_preds3, nclass = 7, n.init = 500, 
    decreasing = T, testiter = 500, maxiter = 10000, seed = seed, 
    verbose = FALSE)

Manifest items : CAUSAL EDAD_MUJER_REC PUEBLO_ORIGINARIO_REC PAIS_ORIGEN_REC HITO1_EDAD_GEST_SEM_REC MACROZONA PREV_TRAMO_REC 
Covariates (Level 1) : outcome 

Categories for manifest items :
                        Y = 1 Y = 2 Y = 3 Y = 4 Y = 5 Y = 6
CAUSAL                      2     3     4                  
EDAD_MUJER_REC              1     2     3     4     5     6
PUEBLO_ORIGINARIO_REC       1     2     3                  
PAIS_ORIGEN_REC             1     2     3                  
HITO1_EDAD_GEST_SEM_REC     1     2     3     4     5     6
MACROZONA                   1     2     3     4     5     6
PREV_TRAMO_REC              1     2     3     4     5      

Model : Latent class analysis 

Number of latent classes : 7 
Number of observations : 3789 
Number of parameters : 187 

log-likelihood : -26763.76 
     G-squared : 5115.172 
           AIC : 53901.52 
           BIC : 55068.38 

Marginal prevalences for latent classes :
Class 1 Class 2 Class 3 Class 4 Class 5 Class 6 Class 7 
0.18988 0.04756 0.08593 0.31208 0.16780 0.04381 0.15293 

Class prevalences by group :
    Class 1 Class 2 Class 3 Class 4 Class 5 Class 6 Class 7
ALL 0.18988 0.04756 0.08593 0.31208  0.1678 0.04381 0.15293

Logistic regression coefficients :
            Class 1/7 Class 2/7 Class 3/7 Class 4/7 Class 5/7
(Intercept)    0.9933   -1.0814   -0.0580    1.9176   -0.3817
outcome1      -0.8885   -0.0947   -0.5814   -1.4400    0.5080
            Class 6/7
(Intercept)   -1.5881
outcome1       0.3634
Item-response probabilities :
CAUSAL 
         Y = 1  Y = 2  Y = 3
Class 1 0.9716 0.0284 0.0000
Class 2 0.7457 0.2543 0.0000
Class 3 0.3733 0.6267 0.0000
Class 4 0.1303 0.8697 0.0000
Class 5 0.0748 0.9252 0.0000
Class 6 0.0591 0.0000 0.9409
Class 7 0.0080 0.0000 0.9920
EDAD_MUJER_REC 
         Y = 1  Y = 2  Y = 3  Y = 4  Y = 5  Y = 6
Class 1 0.0007 0.0112 0.2433 0.4167 0.2923 0.0359
Class 2 0.0681 0.0362 0.2959 0.3122 0.2496 0.0380
Class 3 0.0000 0.0169 0.3047 0.3113 0.3103 0.0569
Class 4 0.0017 0.0464 0.3004 0.3202 0.2646 0.0666
Class 5 0.0034 0.0000 0.1020 0.3156 0.5052 0.0738
Class 6 0.0000 0.1787 0.3661 0.2472 0.1782 0.0298
Class 7 0.0017 0.3683 0.2944 0.2092 0.1071 0.0192
PUEBLO_ORIGINARIO_REC 
         Y = 1  Y = 2  Y = 3
Class 1 0.0844 0.8658 0.0498
Class 2 1.0000 0.0000 0.0000
Class 3 0.1309 0.8321 0.0369
Class 4 0.1286 0.8071 0.0642
Class 5 0.1382 0.8618 0.0000
Class 6 0.1292 0.8336 0.0372
Class 7 0.1519 0.8031 0.0449
PAIS_ORIGEN_REC 
         Y = 1  Y = 2  Y = 3
Class 1 0.0000 0.8834 0.1166
Class 2 0.0848 0.8242 0.0910
Class 3 0.0084 0.0000 0.9916
Class 4 0.0000 0.9692 0.0308
Class 5 0.0000 0.9219 0.0781
Class 6 0.0000 0.0000 1.0000
Class 7 0.0000 0.9917 0.0083
HITO1_EDAD_GEST_SEM_REC 
         Y = 1  Y = 2  Y = 3  Y = 4  Y = 5  Y = 6
Class 1 0.0749 0.2015 0.2306 0.4870 0.0000 0.0059
Class 2 0.0574 0.0000 0.0905 0.2468 0.2951 0.3101
Class 3 0.0077 0.0077 0.3147 0.3899 0.2207 0.0594
Class 4 0.0013 0.0000 0.2861 0.3710 0.2451 0.0964
Class 5 0.0161 0.0047 0.6078 0.2572 0.0992 0.0149
Class 6 0.0204 0.7946 0.1851 0.0000 0.0000 0.0000
Class 7 0.0088 0.7794 0.2119 0.0000 0.0000 0.0000
MACROZONA 
         Y = 1  Y = 2  Y = 3  Y = 4  Y = 5  Y = 6
Class 1 0.0000 0.3971 0.1643 0.1693 0.1109 0.1584
Class 2 0.0327 0.1899 0.3338 0.1624 0.0986 0.1826
Class 3 0.0000 0.6294 0.0846 0.0149 0.2518 0.0193
Class 4 0.0009 0.2832 0.1819 0.2321 0.0903 0.2116
Class 5 0.0000 0.6807 0.1169 0.0600 0.0713 0.0712
Class 6 0.0060 0.5588 0.0796 0.0161 0.2983 0.0412
Class 7 0.0052 0.3843 0.1752 0.1447 0.0860 0.2046
PREV_TRAMO_REC 
         Y = 1  Y = 2  Y = 3  Y = 4  Y = 5
Class 1 0.0000 0.0874 0.5601 0.3524 0.0000
Class 2 0.0243 0.0403 0.6488 0.2295 0.0571
Class 3 0.0066 0.0000 0.6076 0.2792 0.1065
Class 4 0.0013 0.0116 0.6462 0.3410 0.0000
Class 5 0.0000 0.5706 0.1718 0.2509 0.0068
Class 6 0.0299 0.0040 0.5209 0.1640 0.2812
Class 7 0.0017 0.0703 0.7221 0.2006 0.0053
Show code
save.image("data2_lca2_adj4.RData")
Show code
require(tidyverse)
sesion_info <- devtools::session_info()
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
) %>% 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Variable' = 2, 'Percentage'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '50%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",#;
        "}")))