Análisis de clases latentes exploratorio y comparativo sin predictores, sin pueblo originario, pero con año y recategorización de edad de la mujer y semana gestacional del hito 1
<!-- Image-zoom helper: clicking any plot image shows an enlarged copy in a
     fixed, centered overlay; clicking the overlay hides it again.
     BUG FIX: the opening <script> tags were lost in extraction, leaving the
     jQuery include and the inline handler fused on one bare line. -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
<script>
$(document).ready(function() {
  // Overlay container that will hold the enlarged copy of a clicked image
  $('body').prepend('<div class="zoomDiv"><img src="" class="zoomImg"></div>');
  // onClick function for all plots (img's): copy the clicked image into the
  // overlay and style the overlay as a centered, fixed lightbox
  $('img:not(.zoomImg)').click(function() {
    $('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
    $('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
  });
  // onClick function for zoomImg: collapse the overlay
  $('img.zoomImg').click(function() {
    $('.zoomDiv').css({opacity: '0', width: '0%'});
  });
});
</script>
<!-- Show/Hide toggles for chunks tagged with class "fold": adds a button to
     source ("s"), output ("o") and plot blocks, hides them on load, and
     toggles visibility on click.
     BUG FIX: the opening <script> tag for the inline handler was lost in
     extraction, fusing it onto the hideOutput.js include line. -->
<script src="hideOutput.js"></script>
<script>
$(document).ready(function() {
  $chunks = $('.fold');
  $chunks.each(function () { // add button to source code chunks
    if ( $(this).hasClass('s') ) {
      $('pre.r', this).prepend("<div class=\"showopt\">Show Source</div><br style=\"line-height:22px;\"/>");
      $('pre.r', this).children('code').attr('class', 'folded');
    } // add button to output chunks
    if ( $(this).hasClass('o') ) {
      $('pre:not(.r)', this).has('code').prepend("<div class=\"showopt\">Show Output</div><br style=\"line-height:22px;\"/>");
      $('pre:not(.r)', this).children('code:not(r)').addClass('folded'); // add button to plots
      $(this).find('img').wrap('<pre class=\"plot\"></pre>');
      $('pre.plot', this).prepend("<div class=\"showopt\">Show Plot</div><br style=\"line-height:22px;\"/>");
      $('pre.plot', this).children('img').addClass('folded');
    }
  }); // hide all chunks when document is loaded
  $('.folded').css('display', 'none'); // function to toggle the visibility
  $('.showopt').click(function() {
    var label = $(this).html();
    if (label.indexOf("Show") >= 0) {
      $(this).html(label.replace("Show", "Hide"));
    } else {
      $(this).html(label.replace("Hide", "Show"));
    }
    $(this).siblings('code, img').slideToggle('fast', 'swing');
  });
});
</script>

Cargamos los datos
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 653590 35 1333900 71.3 931633 49.8
Vcells 1177589 9 8388608 64.0 2075161 15.9
[1] "2023-05-12 10:17:18 -04"
# Load the pre-processed dataset (creates `data2` in the global environment)
load("data2.RData")
Cargamos los paquetes
knitr::opts_chunk$set(echo = TRUE)
# Install-if-missing, then attach, every CRAN package used in this report.
# BUG FIX: the original `if(!require(p)){install.packages(p)}` idiom installed
# a missing package but never attached it afterwards; here we library() it
# after installing so the first run works end to end.
cran_pkgs <- c(
  "poLCA", "compareGroups", "parallel", "Hmisc", "tidyverse", "emmeans",
  "nnet", "here", "doParallel", "progress", "caret", "rpart", "rpart.plot",
  "partykit", "randomForest", "ggcorrplot", "polycor", "tableone",
  "DiagrammeR", "rsvg", "DiagrammeRsvg", "webshot", "htmlwidgets"
)
for (pkg in cran_pkgs) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
# sjPlot occasionally fails to install on restricted machines; keep best-effort
try(if (!require(sjPlot)) install.packages("sjPlot"))
# poLCAParallel is GitHub-only (parallelised drop-in replacement for poLCA)
if (!require(poLCAParallel)) devtools::install_github("QMUL/poLCAParallel@package")
#if(!require(poLCA)){githubinstall::gh_install_packages("poLCA", ref = github_pull("14"))}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
lca_dir<-here::here()
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#
#
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Evaluate an expression; on failure return NA instead of stopping.
# try() still prints the error message to stderr, as in the original.
tryNA <- function(x) {
  result <- try(x)
  if (inherits(result, "try-error")) NA else result
}
#https://rdrr.io/github/hyunsooseol/snowRMM/src/R/lca.b.R
#https://github.com/dlinzer/poLCA/issues/7
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#' Bivariate residuals for latent class models
#'
#' Calculate the "bivariate residuals" (BVRs) between pairs of variables
#' in a latent class model.
#'
#' This function compares the model-implied (expected) counts in the crosstables
#' of all pairs of observed dependent variables to the observed counts. For each
#' pair, it calculates a "chi-square" statistic,
#'
#' \deqn{\text{BVR} = \sum_{j, j'} \frac{(n_{jj'} - e_{jj'})^2}{e_{jj'}}},
#'
#' where \eqn{n_{jj'}} are the observed counts for categories \eqn{j} and \eqn{j'}
#' of the variables being crosstabulated, and \eqn{e_{jj'}} are
#' the expected counts under the latent class model.
#'
#' Note that the BVR does not follow an asymptotic chi-square distribution and
#' for accurate p-values, parametric bootstrapping is necessary (Oberski et al. 2013).
#'
#' @param fit A poLCA fit object
#' @param tol Optional: tolerance for small expected counts
#' @param rescale_to_df Optional: whether to divide the pairwise "chi-square" values by
#' the degrees of freedom of the local crosstable. Default is TRUE.
#' @return The table of bivariate residuals
#' @author Daniel Oberski (daniel.oberski@gmail.com)
#' @seealso \code{\link{poLCA}} for fitting the latent class model.
#' @references
#' Oberski, DL, Van Kollenburg, GH and Vermunt, JK (2013).
#' A Monte Carlo evaluation of three methods to detect local dependence in binary data latent class models.
#' Advances in Data Analysis and Classification 7 (3), 267-279.
#' @examples
#' data(values)
#' f <- cbind(A, B, C, D) ~ 1
#' M0 <- poLCA(f,values, nclass=1, verbose = FALSE)
#' bvr(M0) # 12.4, 5.7, 8.3, 15.6, ...
bvr <- function(fit, tol = 1e-3, rescale_to_df = TRUE) {
  # BUG FIX: inherits() is the robust type check; `class(fit) == "poLCA"`
  # breaks (length > 1 condition) when the object carries multiple classes.
  stopifnot(inherits(fit, "poLCA"))
  # The last two columns of predcell are "observed"/"expected"; the remaining
  # leading columns are the manifest (observed dependent) variables.
  ov_names <- names(fit$predcell)[1:(ncol(fit$predcell) - 2)]
  ov_combn <- combn(ov_names, 2)
  # Chi-square-style BVR for one pair of observed variables
  get_bvr <- function(ov_pair) {
    form_obs <- as.formula(paste0("observed ~ ", ov_pair[1], " + ", ov_pair[2]))
    form_exp <- as.formula(paste0("expected ~ ", ov_pair[1], " + ", ov_pair[2]))
    counts_obs <- xtabs(form_obs, data = fit$predcell)
    counts_exp <- xtabs(form_exp, data = fit$predcell)
    counts_exp <- ifelse(counts_exp < tol, tol, counts_exp) # Prevent Inf/NaN
    bvr_df <- prod(dim(counts_exp) - 1) # df of the local crosstable
    bvr_value <- sum((counts_obs - counts_exp)^2 / counts_exp)
    if(rescale_to_df) bvr_value <- bvr_value / bvr_df
    attr(bvr_value, "df") <- bvr_df
    bvr_value
  }
  bvr_pairs <- apply(ov_combn, 2, get_bvr)
  # Dress the vector up as a "dist" object so printing shows a lower triangle
  attr(bvr_pairs, "rescale_to_df") <- rescale_to_df
  attr(bvr_pairs, "class") <- "dist"
  attr(bvr_pairs, "Size") <- length(ov_names)
  attr(bvr_pairs, "Labels") <- ov_names
  attr(bvr_pairs, "Diag") <- FALSE
  attr(bvr_pairs, "Upper") <- FALSE
  bvr_pairs
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Entropy of the model-implied distribution over the full response-pattern
# space: enumerates every possible cell (not only observed patterns) and
# drops NA contributions — hence the ".fix" suffix relative to the package's
# own entropy function. `lc` is a fitted poLCA object.
poLCA.entropy.fix <- function (lc)
{
  categories_per_item <- sapply(lc$probs, ncol)
  all_patterns <- expand.grid(sapply(categories_per_item, seq, from = 1))
  pattern_probs <- poLCA.predcell(lc, all_patterns)
  -sum(pattern_probs * log(pattern_probs), na.rm = TRUE)
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Calculate entropy R2 for poLCA model
# MIT license
# Author: Daniel Oberski
# Input: result of a poLCA model fit
# Output: entropy R^2 statistic (Vermunt & Magidson, 2013, p. 71)
# See: daob.nl/wp-content/uploads/2015/07/ESRA-course-slides.pdf
# And: https://www.statisticalinnovations.com/wp-content/uploads/LGtecnical.pdf
# Entropy R^2 for a poLCA fit (Vermunt &amp; Magidson, 2013, p. 71):
# 1 - (mean posterior classification entropy / entropy of class proportions).
# `fit` needs only $P (class proportions) and $posterior (n x k matrix).
machine_tolerance <- sqrt(.Machine$double.eps)
entropy.R2 <- function(fit) {
  shannon <- function(p) {
    # drop (near-)zero probabilities, since lim_{p->0} p*log(p) = 0
    nonzero <- p[p > machine_tolerance]
    -sum(nonzero * log(nonzero))
  }
  prior_uncertainty <- shannon(fit$P)                         # class proportions
  posterior_uncertainty <- mean(apply(fit$posterior, 1, shannon))
  (prior_uncertainty - posterior_uncertainty) / prior_uncertainty
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#http://researchdata.gla.ac.uk/879/1/Survey_data_processed_using_R.pdf
##Function to plot variable probabilites by latent class
## Function to undertake chisquare analayis and plot graphs of residuals and contributions
## Chi-square test of independence between an indicator and the predicted
## latent classes, plus two corrplot heatmaps: Pearson residuals and each
## cell's percentage contribution to the chi-square statistic.
## Args:
##   indfactor.data  the indicator variable
##   predclass.data  predicted class membership
##   noclasses       number of classes (not used in the body; kept so existing
##                   calls remain valid)
##   pitem, gitem    labels interpolated into the plot titles
##   chirows, chicols  row/column names for the residual/contribution matrices
## Returns: the htest object from chisq.test().
## NOTE(review): corrplot() requires the corrplot package, which is not among
## the packages attached at the top of this file — confirm it is loaded.
chisquaretest.predictions.function <-
function(indfactor.data,
predclass.data,
noclasses,
pitem,
gitem,
chirows,
chicols) {
chisquare.results <- chisq.test(indfactor.data, predclass.data)
residuals.data <- chisquare.results$residuals
colnames(residuals.data) <- chicols
rownames(residuals.data) <- chirows
title.text <-
paste(
"Residuals: chi Square Crosstabulation of\n",
pitem,
"and",
gitem,
"\n(Chisquare =",
round(chisquare.results$statistic, 3),
" p <",
round(chisquare.results$p.value, 3),
")",
sep = " "
)
corrplot(
residuals.data,
is.cor = FALSE,
title = title.text,
mar = c(0, 0, 4, 0)
)
# Percentage contribution of each cell to the overall chi-square statistic
contrib.data <-
100 * residuals.data ^ 2 / chisquare.results$statistic
# BUG FIX: round() returns a new object; the original called
# `round(contrib.data, 3)` without assigning it, so unrounded values
# were silently plotted.
contrib.data <- round(contrib.data, 3)
colnames(contrib.data) <- chicols
rownames(contrib.data) <- chirows
title.text <-
paste(
"Contributions: chi Square Crosstabulation of\n",
pitem,
"and",
gitem,
"\n(Chisquare =",
round(chisquare.results$statistic, 3),
" p <",
round(chisquare.results$p.value, 3),
")",
sep = " "
)
corrplot(
contrib.data,
is.cor = FALSE,
title = title.text,
mar = c(0, 0, 4, 0)
)
return(chisquare.results)
}
##Funciton for Cramers V test
## Cramer's V effect size for the association of two categorical vectors:
## V = sqrt(X2 / (n * (min(r, c) - 1))), using the uncorrected chi-square.
## Prints a label, then returns V as a plain numeric.
cv.test <- function(x, y) {
  CV <- sqrt(chisq.test(x, y, correct = FALSE)$statistic /
               (length(x) * (min(length(unique(x)), length(unique(y))) - 1)))
  # BUG FIX: the original line ended with an unbalanced ")" —
  # `print.noquote("Cramér V / Phi:"))` — which is a parse error.
  print.noquote("Cramér V / Phi:")
  return(as.numeric(CV))
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Windows-only: inspect and raise the per-process memory cap before the heavy
# LCA fits. NOTE(review): memory.size()/memory.limit() are defunct as of
# R 4.2, where these calls only emit warnings — confirm the R version in use.
if(.Platform$OS.type == "windows") withAutoprint({
memory.size()
memory.size(TRUE)
memory.limit(size=56000)
})
# Directory of this script when edited in RStudio; try() guards non-RStudio runs
path<-try(dirname(rstudioapi::getSourceEditorContext()$path))
# Render NA as empty cells in kable tables
options(knitr.kable.NA = '')
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# knitr chunk hook that reports each chunk's wall-clock run time as an HTML div.
knitr::knit_hooks$set(time_it = local({
  now <- NULL
  function(before, options) {
    if (before) {
      # record the current time before each chunk
      now <<- Sys.time()
    } else {
      # BUG FIX: difftime() with default units = "auto" can return minutes or
      # hours, yet the original compared that value to 60^2 and divided by
      # 3600/60 as if it were always seconds. Force seconds explicitly.
      elapsed <- as.numeric(difftime(Sys.time(), now, units = "secs"))
      # scalar condition: plain if/else instead of ifelse()
      if (elapsed > 60^2) {
        x <- paste("Time for this code chunk to run:", round(elapsed / 60^2, 1), "hours")
      } else {
        x <- paste("Time for this code chunk to run:", round(elapsed / 60, 1), "minutes")
      }
      # return a character string to show the time
      paste('<div class="message">', gsub('##', '\n', x),'</div>', sep = '\n')
    }
  }
}))
knitr::opts_chunk$set(time_it = TRUE)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#to format rows in bold
# Wrap the content of selected data-frame cells in pandoc markup:
# one * for italics, ** for bold, ~~ for strikethrough. Empty cells stay empty;
# internal spaces are stripped so the markup binds to the text.
# Args:
#   df    data frame (affected columns are coerced to character)
#   rows  row indices to format
#   cols  column indices or names to format
#   value which markup to apply; defaults to "italics"
format_cells <- function(df, rows, cols, value = c("italics", "bold", "strikethrough")){
  # BUG FIX: without match.arg(), omitting `value` left the full length-3
  # default vector in `markup`, and paste0() below produced a malformed
  # multi-element result instead of one formatted string.
  value <- match.arg(value)
  # select the correct markup
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]
  for (r in rows){
    for (col in cols){   # renamed from `c` to avoid shadowing base::c
      # Make sure values are not factors
      df[[col]] <- as.character(df[[col]])
      # Update formatting; keep empty cells empty
      df[r, col] <- ifelse(nchar(df[r, col]) == 0, "",
                           paste0(markup, gsub(" ", "", df[r, col]), markup))
    }
  }
  return(df)
}
#To produce line breaks in messages and warnings
# knitr hooks rendering errors/warnings/messages as Bootstrap alert divs,
# restoring line breaks from knitr's "##" output prefixes.
knitr::knit_hooks$set(
  error = function(x, options) {
    # BUG FIX: '^##\ Error' contained the invalid escape "\ " (a parse error
    # in R strings); a space needs no escaping in a regex.
    paste('\n\n<div class="alert alert-danger">',
          gsub('##', '\n', gsub('^## Error', '**Error**', x)),
          '</div>', sep = '\n')
  },
  warning = function(x, options) {
    paste('\n\n<div class="alert alert-warning">',
          gsub('##', '\n', gsub('^## Warning:', '**Warning**', x)),
          '</div>', sep = '\n')
  },
  message = function(x, options) {
    paste('<div class="message">',
          gsub('##', '\n', x),
          '</div>', sep = '\n')
  }
)
#
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Definimos ciertas constantes
# Tuning constants shared by every LCA run below
clus_iter= 500#500 #500
n_thread <- parallel::detectCores()
nrep <- clus_iter # number of different initial values (could be n_thread too)
n_class_max <- 12 # maximum number of classes to investigate
n_bootstrap <- 100#00 #30 # 50 number of bootstrap samples
print(n_thread)
[1] 8
library(DiagrammeR)
# Conceptual diagram of the measurement model: one latent-class node pointing
# at the eight observed indicators.
# BUG FIX: the "PAIS_ORIGEN_REC" label contained the mojibake "PaÃs"
# (UTF-8 read as Latin-1); restored to "País".
gr_lca<-
DiagrammeR::grViz("
digraph flowchart {
fontname='Comic Sans MS'
# Nodes
subgraph samelevel {
ANIO [label = 'Año',fontsize=10,shape = box]
CAUSAL [label = 'Causal',fontsize=10,shape = box]
MUJER_REC [label = 'Sexo\n(mujer)',fontsize=10,shape = box]
EDAD_MUJER_REC [label = 'Edad\nmujer',fontsize=10,shape = box]
HITO1_EDAD_GEST_SEM_REC [label = 'Edad\nGestacional\nHito 1',fontsize=10,shape = box]
MACROZONA [label = 'Macrozona',fontsize=10,shape = box]
PAIS_ORIGEN_REC [label = 'País de\norigen',fontsize=10,shape = box]
PREV_TRAMO_REC [label = 'Previsión y\ntramo',fontsize=10,shape = box]
{rank=same; rankdir= 'TB'; CAUSAL MUJER_REC EDAD_MUJER_REC HITO1_EDAD_GEST_SEM_REC MACROZONA PAIS_ORIGEN_REC PREV_TRAMO_REC ANIO}
}
LCA [label= 'Clases\nlatentes', shape= circle, style=filled, color=lightgrey, fontsize=10]
# Nodes
subgraph {
LCA -> {CAUSAL MUJER_REC EDAD_MUJER_REC HITO1_EDAD_GEST_SEM_REC MACROZONA PAIS_ORIGEN_REC PREV_TRAMO_REC ANIO} [rank=same; rankdir= 'TB']
}
# subgraph {
# LCA -> inter [minlen=14] #minlen es necesario para correr arrowhead = none;
# {rank=same; LCA inter [rankdir='LR']}; #;
# }
}")#, width = 1200, height = 900
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3733703/
#Cohort matching on a variable associated with both outcome and censoring
#Cohort matching on a confounder. We let A denote an exposure, Y denote an outcome, and C denote a confounder and matching variable. The variable S indicates whether an individual in the source population is selected for the matched study (1: selected, 0: not selected). See Section 2-7 for details.
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7064555/
# Export the schematic diagram in three formats: PDF, high-resolution PNG,
# and a standalone HTML widget (with a webshot PNG screenshot of it).
DPI = 1200
WidthCM = 21
HeightCM = 8
gr_lca %>%
export_svg %>% charToRaw %>% rsvg_pdf("_flowchart_lca_sin_po.pdf")
# 2.54 converts centimetres to inches for the pixel-size computation
gr_lca %>% export_svg()%>%charToRaw %>% rsvg(width = WidthCM *(DPI/2.54), height = HeightCM *(DPI/2.54)) %>% png::writePNG("_flowchart_lca0_sin_po.png")
htmlwidgets::saveWidget(gr_lca, "_flowchart_lca_sin_po.html")
webshot::webshot("_flowchart_lca_sin_po.html", "_flowchart_lca_sin_po.png",vwidth = 1200, vheight = 900,
zoom = 2)
Figure 1: Gráfico esquemático
Los valores ceros se declararon datos perdidos.
#table(data2$MACROZONA,data2$REGION_RESIDENCIA, exclude=NULL)
# Manifest variables for the latent class model
preds <- c("AÑO", "CAUSAL","EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC", "MACROZONA", "PREV_TRAMO_REC")
#c("AÑO","MES_NUM", "HITO3_COMPLICACION_POST_IVE", "HITO3_CONDICION_MUJER_POST_IVE", "HITO3_TIPO_ATENCION", "HITO3_SERV_SALUD_ESTABLECIMIENTO", "HITO4_MUJER_ACEPTA_ACOM"),
mydata_preds <- data2 %>%
  # Age brackets: 18-24 is ~18%, 25-35 ~42%, and the remaining ~30% are >35
  # NA=1; >=4 & <14 =2; >=14 & <28=2; >=28=3 ???
  # BUG FIX: the 18-24 and 25-35 conditions were written as
  # "EDAD_MUJER<17 & EDAD_MUJER<25" and "EDAD_MUJER<24 & EDAD_MUJER<=35",
  # which can never describe the intended ranges (cf. the correct recode of
  # mydata_preds2 below); lower bounds now use >=.
  dplyr::mutate(
    EDAD_MUJER_REC = dplyr::case_when(
      EDAD_MUJER < 18 ~ "1.<18",
      EDAD_MUJER >= 18 & EDAD_MUJER < 25 ~ "2.18-24",
      EDAD_MUJER >= 25 & EDAD_MUJER <= 35 ~ "3.25-35",
      EDAD_MUJER > 35 ~ "4.>35"
    ),
    HITO1_EDAD_GESTACIONAL_SEMANAS = dplyr::case_when(
      HITO1_EDAD_GESTACIONAL_SEMANAS >= 4 & HITO1_EDAD_GESTACIONAL_SEMANAS < 14 ~ "1.4-13 semanas",
      HITO1_EDAD_GESTACIONAL_SEMANAS >= 14 & HITO1_EDAD_GESTACIONAL_SEMANAS < 28 ~ "2.14-27 semanas",
      HITO1_EDAD_GESTACIONAL_SEMANAS >= 28 ~ "3.>28 semanas"
    )
  ) %>%
  # Recode each indicator to consecutive integers starting at 2, reserving 1
  # for missing values (poLCA needs positive-integer manifest variables).
  # all_of() replaces the deprecated bare external vector inside across().
  dplyr::mutate(across(all_of(preds), ~ as.numeric(factor(.)) + 1)) %>%
  dplyr::mutate(across(all_of(preds), ~ dplyr::case_when(is.na(.) ~ 1, TRUE ~ .)))
# check the recode
#table(mydata_preds$sus_ini_mod_mvv, data2$sus_ini_mod_mvv, exclude=NULL)
f_preds<-cbind(AÑO, CAUSAL, EDAD_MUJER_REC, PAIS_ORIGEN_REC, HITO1_EDAD_GEST_SEM_REC, MACROZONA, PREV_TRAMO_REC)~ 1
Uno de los supuestos del análisis de clases latentes es que hay separación entre las distintas covariables. Se contrastó la colinealidad mediante una matriz de correlaciones heterogéneas.
#glimpse(mydata_preds[,preds])
# Heterogeneous (polychoric/polyserial) correlation matrix of the indicators,
# used to check separation / non-collinearity among the manifest variables
mydata_preds_cat <- data.table::data.table(mydata_preds[,preds]) %>% dplyr::mutate_all(~as.character(.))
tiempo_antes_hetcor<-Sys.time()
hetcor_mat<-polycor::hetcor(mydata_preds_cat, ML = T, std.err =T, use="pairwise.complete.obs", bins=5, pd=TRUE)
tiempo_despues_hetcor<-Sys.time()
# Wall-clock time of the hetcor computation (it can be slow on large data)
tiempo_hetcor<-tiempo_despues_hetcor-tiempo_antes_hetcor
# Diagonal / failed tests return NA; force p = 1 so ggcorrplot blanks them
hetcor_mat$tests[is.na(hetcor_mat$tests)]<-1
# Correlation heatmap; non-significant cells are blanked via p.mat
ggcorrplot<-
ggcorrplot::ggcorrplot(hetcor_mat$correlations,
ggtheme = ggplot2::theme_void,
insig = "blank",
pch=1,
pch.cex=3,
tl.srt = 45,
#pch="ns",
p.mat = hetcor_mat$tests, # replacement has 144 rows, data has 169
#type = "lower",
colors = c("#6D9EC1", "white", "#E46726"),
tl.cex=8,
lab=F)+
#scale_x_discrete(labels = var_lbls_p345, drop = F) +
#scale_y_discrete(labels = var_lbls_p345, drop = F) +
#theme(axis.text.x = element_blank())+
theme(axis.text.y = element_text(size=7.5,color ="black", hjust = 1))+
#theme(axis.text.y = element_blank())+
theme(legend.position="bottom")
ggcorrplot
#ggplotly(ggcorrplot, height = 800, width=800)%>%
# layout(xaxis= list(showticklabels = FALSE)) %>%
# layout(annotations =
# list(x = .1, y = -0.031, text = "",
# showarrow = F, xref='paper', yref='paper',
# #xanchor='center', yanchor='auto', xshift=0, yshift=-0,
# font=list(size=11, color="darkblue"))
# )
# Long-format table of correlations and p values, ordered by |r|, for export
invisible("EXPORTAR TABLA DE CORRELACIÓN POLICÓRICA")
hetcor_df<-
reshape2::melt(hetcor_mat$correlations) %>%
dplyr::left_join(reshape2::melt(hetcor_mat$tests), by= c("Var1", "Var2")) %>%
dplyr::filter(Var1!=Var2) %>%
dplyr::rename("Polychoric Correlation"="value.x", "p value"="value.y") %>%
# `ya` is a temporary |r| column used only for sorting, dropped afterwards
dplyr::mutate(`p value`= sprintf("%0.4f",`p value`), `Polychoric Correlation`= sprintf("%2.2f",`Polychoric Correlation`), ya= abs(parse_number(`Polychoric Correlation`)))%>%
dplyr::arrange(desc(ya)) %>%
dplyr::select(-ya)
hetcor_df %>% rio::export("_tab_poly_corr_sin_po.xlsx")
Modificamos la base de datos para incluir la variable resultado.
#dropped region name because it was too different,many categories
# Same recode as mydata_preds, but keeping the outcome variable and building
# a binary `outcome` flag (1 = decision contains 'INTERRUMPIR').
# NOTE(review): the cut points here differ slightly from the earlier recode
# (age 35 falls in "3.25-35" above but in "4.>=35" here, and gestational weeks
# < 4 are included in "1.0-13 semanas" here) — confirm which is intended.
preds2 <-c(setdiff(preds,""),"HITO2_DECISION_MUJER_IVE")
mydata_preds2 <- data2%>%
dplyr::mutate(EDAD_MUJER_REC= factor(dplyr::case_when(EDAD_MUJER<18~"1.<18", EDAD_MUJER>=18 & EDAD_MUJER<25~"2.18-24", EDAD_MUJER>=25 & EDAD_MUJER<35~"3.25-35",EDAD_MUJER>=35~"4.>=35")), HITO1_EDAD_GEST_SEM_REC= factor(dplyr::case_when(HITO1_EDAD_GESTACIONAL_SEMANAS<14~"1.0-13 semanas", HITO1_EDAD_GESTACIONAL_SEMANAS>=14 & HITO1_EDAD_GESTACIONAL_SEMANAS<28~"2.14-27 semanas", HITO1_EDAD_GESTACIONAL_SEMANAS>=28~ "3.>=28 semanas")))%>% dplyr::mutate(across(preds, ~ as.numeric(factor(.))+1))%>% dplyr::mutate(across(preds, ~ dplyr::case_when(is.na(.)~ 1, T~ .))) %>% dplyr::select(any_of(preds2)) %>% dplyr::mutate(outcome= factor(as.numeric(grepl('INTERRUMPIR', HITO2_DECISION_MUJER_IVE))))
#data.table::data.table(.)
#
#table(mydata_preds2$HITO2_DECISION_MUJER_IVE)
#lapply(preds2, function(p) {prop.table(table(mydata_preds2[p]))})
Luego hicimos una regresión de árbol para analizar cómo las variables se relacionan.
#define train and test samples
set.seed(2125)
# Random 70/30 split into train (ind == 1) and test (ind == 2)
ind <- sample(x = 2, size = nrow(mydata_preds2), replace = T, prob = c(0.7, 0.3))
train <- mydata_preds2[ind == 1, ]#%>% dplyr::mutate_if(is.numeric, ~as.character(.)) # converting to character/factor returned nothing
test <- mydata_preds2[ind == 2, ]#%>% dplyr::mutate_if(is.numeric, ~as.character(.))
# outcome ~ all LCA indicators
form_rf <- as.formula(paste("outcome ~ ", paste(preds, collapse="+ ")))
# CART via caret: tune cp over 30 values, 10-fold CV repeated 5 times
mod2.2 <- train(form_rf, data = train, method = 'rpart',
tuneLength = 30, trControl = trainControl(method = 'repeatedcv',
repeats = 5))
mod2.2
CART
2648 samples
7 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 2383, 2384, 2383, 2383, 2383, 2384, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.0000000000 0.8203902 0.07249733
0.0001205691 0.8203902 0.07249733
0.0002411382 0.8203902 0.07249733
0.0003617073 0.8204657 0.07207823
0.0004822763 0.8215983 0.07104289
0.0006028454 0.8227304 0.06846798
0.0007234145 0.8232590 0.06560431
0.0008439836 0.8242407 0.06563069
0.0009645527 0.8269617 0.06511791
0.0010851218 0.8283225 0.06387545
0.0012056909 0.8293056 0.06429539
0.0013262599 0.8310420 0.06403939
0.0014468290 0.8316464 0.06522963
0.0015673981 0.8325540 0.06581119
0.0016879672 0.8327050 0.06613697
0.0018085363 0.8328559 0.06432302
0.0019291054 0.8328559 0.06432302
0.0020496745 0.8329314 0.06390595
0.0021702435 0.8328556 0.06202064
0.0022908126 0.8328556 0.06202064
0.0024113817 0.8328556 0.06202064
0.0025319508 0.8328556 0.06202064
0.0026525199 0.8345929 0.04941893
0.0027730890 0.8345929 0.04941893
0.0028936581 0.8345929 0.04941893
0.0030142272 0.8345929 0.04941893
0.0031347962 0.8344414 0.04684888
0.0032553653 0.8345172 0.03641888
0.0033759344 0.8345172 0.03641888
0.0034965035 0.8349700 0.03510127
Accuracy was used to select the optimal model using the
largest value.
The final value used for the model was cp = 0.003496503.
mod2.2$finalModel
n= 2648
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 2648 429 1 (0.1620091 0.8379909) *
Graficamos los resultados
# Plot the selected CART tree (per the printed output above, the final model
# collapsed to a single root node)
rpart.plot(mod2.2$finalModel)
Definimos los datos
# Final analysis dataset for the LCA; exported to Stata format for traceability
mydata_preds3 <-
mydata_preds2 %>%
#dplyr::mutate_if(is.numeric, ~as.character(.)) %>% # converting to character produced errors
data.table::data.table(.)
mydata_preds3 %>% rio::export("mydata_preds3_2023_05_07.dta")
# Build a tableone summary as a tidy data frame suitable for kable/export.
# Args:
#   df       the data
#   listVars variables to summarise
#   catVars  subset of listVars treated as categorical
#   strata   stratification variable name
# Returns a data frame with a filled-down "Name" column and one row per level.
run_tableone <- function(df, listVars, catVars, strata) {
  # capture.output() swallows the printed table; we keep the matrix it returns
  capture.output(
    x <- print(
      CreateTableOne(vars = listVars, data = df, factorVars = catVars,
                     strata = strata, addOverall = TRUE, includeNA = TRUE),
      showAllLevels = TRUE
    )
  )
  x <- as.data.frame(x) %>%
    # tibble::rownames_to_column() replaces the long-deprecated add_rownames()
    tibble::rownames_to_column("Name")
  # Blank out the placeholder row names tableone generates, then fill downwards
  x$Name <- stringr::str_replace(x$Name, "^X$", NA_character_)
  x$Name <- stringr::str_replace(x$Name, "^X\\.", NA_character_)
  x$Name <- stringr::str_replace_all(x$Name, "\\.{3,}", "")
  x <- x %>% tidyr::fill(Name, .direction = "down")
  # BUG FIX: the function previously ended on an assignment, so its value was
  # returned invisibly; return x explicitly and visibly.
  x
}
#| class-output: controlly
# Descriptive table stratified by the woman's decision (tab2 kept for reuse),
# then the same summary rendered as markdown via run_tableone()
tab2<-
CreateTableOne(vars =c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC","HITO2_DECISION_MUJER_IVE"),
# tableone coerces the variable to numeric otherwise
data= mydata_preds3,
factorVars=c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC"),
strata= "HITO2_DECISION_MUJER_IVE",addOverall = T, includeNA =T)
run_tableone(listVars =c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC","HITO2_DECISION_MUJER_IVE"), df= mydata_preds3, catVars= c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC"),strata= "HITO2_DECISION_MUJER_IVE") %>% knitr::kable("markdown", caption="Descriptivos (acotado)")
| Name | level | Overall | CONTINUAR EL EMBARAZO | INTERRUMPIR EL EMBARAZO | NO APLICA, INSCONSCIENTE | p | test |
|---|---|---|---|---|---|---|---|
| n | 3789 | 593 | 3183 | 13 | |||
| CAUSAL | 2 | 1171 (30.9) | 190 ( 32.0) | 968 ( 30.4) | 13 (100.0) | <0.001 | |
| CAUSAL | 3 | 1887 (49.8) | 346 ( 58.3) | 1541 ( 48.4) | 0 ( 0.0) | ||
| CAUSAL | 4 | 731 (19.3) | 57 ( 9.6) | 674 ( 21.2) | 0 ( 0.0) | ||
| EDAD_MUJER_REC | 1 | 18 ( 0.5) | 2 ( 0.3) | 16 ( 0.5) | 0 ( 0.0) | 0.099 | |
| EDAD_MUJER_REC | 2 | 269 ( 7.1) | 55 ( 9.3) | 214 ( 6.7) | 0 ( 0.0) | ||
| EDAD_MUJER_REC | 3 | 720 (19.0) | 100 ( 16.9) | 618 ( 19.4) | 2 ( 15.4) | ||
| EDAD_MUJER_REC | 4 | 1646 (43.4) | 236 ( 39.8) | 1403 ( 44.1) | 7 ( 53.8) | ||
| EDAD_MUJER_REC | 5 | 1136 (30.0) | 200 ( 33.7) | 932 ( 29.3) | 4 ( 30.8) | ||
| PAIS_ORIGEN_REC | 1 | 18 ( 0.5) | 5 ( 0.8) | 13 ( 0.4) | 0 ( 0.0) | 0.030 | |
| PAIS_ORIGEN_REC | 2 | 3091 (81.6) | 507 ( 85.5) | 2573 ( 80.8) | 11 ( 84.6) | ||
| PAIS_ORIGEN_REC | 3 | 680 (17.9) | 81 ( 13.7) | 597 ( 18.8) | 2 ( 15.4) | ||
| HITO1_EDAD_GEST_SEM_REC | 1 | 87 ( 2.3) | 11 ( 1.9) | 75 ( 2.4) | 1 ( 7.7) | <0.001 | |
| HITO1_EDAD_GEST_SEM_REC | 2 | 1328 (35.0) | 127 ( 21.4) | 1200 ( 37.7) | 1 ( 7.7) | ||
| HITO1_EDAD_GEST_SEM_REC | 3 | 2008 (53.0) | 338 ( 57.0) | 1665 ( 52.3) | 5 ( 38.5) | ||
| HITO1_EDAD_GEST_SEM_REC | 4 | 366 ( 9.7) | 117 ( 19.7) | 243 ( 7.6) | 6 ( 46.2) | ||
| MACROZONA | 1 | 11 ( 0.3) | 3 ( 0.5) | 8 ( 0.3) | 0 ( 0.0) | <0.001 | |
| MACROZONA | 2 | 1608 (42.4) | 195 ( 32.9) | 1410 ( 44.3) | 3 ( 23.1) | ||
| MACROZONA | 3 | 610 (16.1) | 66 ( 11.1) | 536 ( 16.8) | 8 ( 61.5) | ||
| MACROZONA | 4 | 555 (14.6) | 153 ( 25.8) | 401 ( 12.6) | 1 ( 7.7) | ||
| MACROZONA | 5 | 431 (11.4) | 69 ( 11.6) | 361 ( 11.3) | 1 ( 7.7) | ||
| MACROZONA | 6 | 574 (15.1) | 107 ( 18.0) | 467 ( 14.7) | 0 ( 0.0) | ||
| AÑO | 2 | 732 (19.3) | 115 ( 19.4) | 617 ( 19.4) | 0 ( 0.0) | <0.001 | |
| AÑO | 3 | 818 (21.6) | 149 ( 25.1) | 669 ( 21.0) | 0 ( 0.0) | ||
| AÑO | 4 | 662 (17.5) | 103 ( 17.4) | 559 ( 17.6) | 0 ( 0.0) | ||
| AÑO | 5 | 820 (21.6) | 121 ( 20.4) | 689 ( 21.6) | 10 ( 76.9) | ||
| AÑO | 6 | 757 (20.0) | 105 ( 17.7) | 649 ( 20.4) | 3 ( 23.1) | ||
| PREV_TRAMO_REC | 1 | 14 ( 0.4) | 4 ( 0.7) | 10 ( 0.3) | 0 ( 0.0) | <0.001 | |
| PREV_TRAMO_REC | 2 | 488 (12.9) | 37 ( 6.2) | 451 ( 14.2) | 0 ( 0.0) | ||
| PREV_TRAMO_REC | 3 | 2096 (55.3) | 373 ( 62.9) | 1714 ( 53.8) | 9 ( 69.2) | ||
| PREV_TRAMO_REC | 4 | 1092 (28.8) | 176 ( 29.7) | 913 ( 28.7) | 3 ( 23.1) | ||
| PREV_TRAMO_REC | 5 | 99 ( 2.6) | 3 ( 0.5) | 95 ( 3.0) | 1 ( 7.7) | ||
| HITO2_DECISION_MUJER_IVE | CONTINUAR EL EMBARAZO | 593 (15.7) | 593 (100.0) | 0 ( 0.0) | 0 ( 0.0) | <0.001 | |
| HITO2_DECISION_MUJER_IVE | INTERRUMPIR EL EMBARAZO | 3183 (84.0) | 0 ( 0.0) | 3183 (100.0) | 0 ( 0.0) | ||
| HITO2_DECISION_MUJER_IVE | NO APLICA, INSCONSCIENTE | 13 ( 0.3) | 0 ( 0.0) | 0 ( 0.0) | 13 (100.0) |
# Export the same stratified descriptive table to Excel
run_tableone(listVars =c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC","HITO2_DECISION_MUJER_IVE"), df= mydata_preds3, catVars= c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC"),strata= "HITO2_DECISION_MUJER_IVE") %>% rio::export("tabla12_update.xlsx")
#<div class="fold o">
# NOTE(review): the "Con Estamento" label looks copied from another survey
# project — there is no `estamento` variable in this dataset; confirm.
paste0("Con Estamento")
# Chi-square test of each indicator against the decision, one row per variable,
# formatted as a Spanish-style report string (decimal commas)
mydata_preds3 %>%
dplyr::select(HITO2_DECISION_MUJER_IVE, c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC")) %>%
tidyr::gather(variable,measure, -HITO2_DECISION_MUJER_IVE) %>%
dplyr::group_by(variable) %>%
dplyr::do(chisq.test(.$HITO2_DECISION_MUJER_IVE, .$measure) %>% broom::tidy()) %>%
dplyr::mutate(p.value=ifelse(p.value<.001,"<0.001",sprintf("%1.3f",p.value))) %>%
dplyr::mutate(statistic=sprintf("%2.0f",statistic)) %>%
dplyr::mutate(report=paste0("X²(",parameter,", ",nrow(mydata_preds3),")=",statistic,"; p", ifelse(p.value=="<0.001",p.value, paste0("=",p.value)))) %>%
dplyr::mutate(report=sub("0\\.","0,",sub("\\.",",",report)))
# explanatory63 = c("p40_istas_soporte", "p40_cond_trab_istas_10_p40_rec", "ratio_nec_insuf_mask2", "p8_exp_c19_cerca_cont_rec", "p36_cond_trab_reasig_rec", "p26_cond_trab_ent_protec_3_p26_rec", "p45_demo_sexo", "estamento")
# NOTE(review): the remaining statements in this chunk reference variables
# (p8_exp_c19_cerca_cont, p40_istas_soporte, p45_demo_sexo, estamento) and a
# helper (copy_names()) that do not exist in mydata_preds3 or anywhere in this
# file — they appear to be leftovers from a different survey project and will
# error if executed; confirm and remove or adapt.
fisher_1<-
fisher.test(table(mydata_preds3$p8_exp_c19_cerca_cont,
mydata_preds3$HITO2_DECISION_MUJER_IVE), simulate.p.value=T, B=10000)
# Simulated-p Fisher exact test of each indicator against the decision
mydata_preds3 %>%
dplyr::select(HITO2_DECISION_MUJER_IVE, c("CAUSAL", "EDAD_MUJER_REC", "PAIS_ORIGEN_REC", "HITO1_EDAD_GEST_SEM_REC","MACROZONA", "AÑO", "PREV_TRAMO_REC")) %>%
tidyr::gather(variable,measure, -HITO2_DECISION_MUJER_IVE) %>%
dplyr::group_by(variable) %>%
dplyr::do(fisher.test(.$HITO2_DECISION_MUJER_IVE, .$measure, workspace = 2e10, simulate.p.value = T, B=1e5) %>% broom::tidy())
#
paste0("H(",kruskal.test(p40_istas_soporte ~ estamento, data=mydata_preds3)$parameter,")=",
round(kruskal.test(p40_istas_soporte ~ estamento, data=mydata_preds3)$statistic,1), ", p=",
round(kruskal.test(p40_istas_soporte ~ estamento, data=mydata_preds3)$p.value,3)) %>%
copy_names()
#_#_#_#_#_#_#_#_
paste0("Con sexo")
paste0("Wlicoxon test, sexo-soporte ")
paste0("W= ",round(broom::tidy(wilcox.test(p40_istas_soporte ~ p45_demo_sexo, data=mydata_preds3))$statistic/sqrt(nrow(mydata_preds3)),0),", p= ",round(broom::tidy(wilcox.test(p40_istas_soporte ~ p45_demo_sexo, data=mydata_preds3))$p.value,3))
Así se ven los datos como los usa poLCA
Corremos una versión paralelizada de poLCA.
#Biemer, P. P., & Wiesen, C. (2002). Measurement error evaluation of self-reported drug use: a latent class analysis of the US National Household Survey on Drug Abuse. Journal of the Royal Statistical Society: Series A (Statistics in Society), 165(1), 97–119. doi:10.1111/1467-985x.00612
#lca_entropia(x="ppio", seed= 2125, k= 8, f= f_preds, dat= mydata_preds, nbr_repet= 30, na_rm= T)
#3
#<div style="border: 1px solid #ddd; padding: 5px; overflow-y: scroll; height:400px; overflow-x: scroll; width:100%">
# f is the selected variables
# dat is the data
# nb_var is the number of selected variables
# k is the number of latent class generated
# nbr_repet is the number of repetition to
# reach the convergence of EM algorithm
# x es el código para las variables de los modelos
#seed es el numero random para las semillas. ej: 4345.
#Modo de calcular el mejor modelo.
#z_ #
#2023-01-20
#https://github.com/QMUL/poLCAParallel/blob/master/exec/3_blrt.R
#0h s
#0h s
seed<-2125
old <- Sys.time()
library(progress)  # library() instead of require() for a hard dependency
set.seed(seed)
model_array <- list() # fitted k-class model stored at index k
# NOTE(review): `message_class` is not a documented progress_bar$new()
# argument — confirm the installed progress version accepts it.
pb <- progress_bar$new(total = n_class_max, message_class = "Running poLCA")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Fit a k-class model for every k, retrying with fewer random starts on failure.
# BUG FIX: the original halved nrep_int inside the tryCatch error handler with
# `<-`, which only modified the handler's own local copy — a persistent error
# therefore made the while-loop spin forever. The halving now happens in the
# loop body after a NULL (failed) fit.
for (k in 1:n_class_max) {
  nrep_int <- nrep # start from the full number of random initial values
  while (nrep_int >= 1) {
    set.seed(seed)
    mod <- tryCatch({
      poLCAParallel::poLCA(
        f_preds,
        mydata_preds3,
        nclass = k,
        nrep = nrep_int,
        maxiter = 1e4,
        n.thread = 12,
        verbose = FALSE
      )
    }, error = function(e) {
      message(paste("Error in poLCA for k =", k, ", nrep =", nrep_int, ":", conditionMessage(e)))
      NULL # signal failure to the loop body
    })
    if (!is.null(mod)) {
      model_array[[k]] <- mod # store the result if no error occurs
      break # exit the retry loop on success
    }
    nrep_int <- nrep_int / 2 # reduce nrep by half and retry
  }
  #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
  pb$tick() # increment the progress bar
  cat(sprintf("\r%d%% completed", round(k/n_class_max*100))) # print progress percentage
  Sys.sleep(.05)
}
8% completed
17% completed
25% completed
33% completed
42% completed
50% completed
58% completed
67% completed
75% completed
83% completed
92% completed
100% completed
pb$terminate() # Close the progress bar
cat(': Done') # Print "Done" message
: Done
# Keep a named copy of this run's fits for later comparison
model_array_ppio <- model_array
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#BOOTSTRAP#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
new_med <- Sys.time()
# BUG FIX: the original message dropped difftime's auto-chosen unit, printing
# e.g. "The model took 22.85 until every LCA was computed" (see the captured
# output); report the unit explicitly.
elapsed_lca <- difftime(new_med, old)
paste0("The model took ", round(as.numeric(elapsed_lca), 2), " ",
       units(elapsed_lca), " until every LCA was computed")
[1] "The model took 22.85 until every LCA was computed"
Luego calculamos la razón de verosimilitud mediante remuestreo bootstrap (BLRT) entre cada modelo y el que asume una clase menos.
# store p values for each nclass, 1 to n_class_max
# store 0 for 1 number of class, ie this says you cannot have zero number of
# classes
# BLRT p-values indexed by number of classes; position 1 is pinned to 0
# because a "0 classes vs 1 class" comparison does not exist.
p_value_array <- 0
# Observed log-likelihood ratio per class count, plus the full bootstrap
# distribution of ratios (one vector per class count).
fitted_log_ratio_array <- rep(NaN, times = n_class_max)
bootstrap_log_ratio_array <- vector(mode = "list")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# do the bootstrap likelihood ratio test for each number of classes
# Bootstrap likelihood-ratio test (BLRT): compare each k-class model against
# the (k-1)-class model.
for (nclass in 2:n_class_max) {
  # Null and alternative models differ by exactly one class
  null_model <- model_array[[nclass - 1]]
  alt_model <- model_array[[nclass]]
  # Parallel bootstrap; returns the fitted ratio, the bootstrap distribution
  # of ratios, and the resulting p-value
  bootstrap_results <- poLCAParallel::blrt(
    null_model, alt_model,
    n_bootstrap, n_thread, nrep
  )
  # Observed log-likelihood ratio between the two models
  fitted_log_ratio_array[nclass] <- bootstrap_results[["fitted_log_ratio"]]
  # Bootstrap-sample ratios (the null distribution the observed ratio is
  # located in)
  bootstrap_log_ratio_array[[nclass]] <-
    bootstrap_results[["bootstrap_log_ratio"]]
  # Append the p-value for this class count
  p_value_array <- c(p_value_array, bootstrap_results[["p_value"]])
  # FIX: the original printed progress with no separator (output ran together
  # as "17% completed25% completed...") and used cat('\014'), which clears the
  # console only in RStudio. Overwrite the line with a carriage return instead.
  cat(sprintf("\r%d%% completed", round(nclass / n_class_max * 100)))
  Sys.sleep(.05)
  if (nclass == n_class_max) cat(': Done')
}
17% completed25% completed33% completed42% completed50% completed58% completed67% completed75% completed83% completed92% completed100% completed: Done
[1] "The model took 19.62 minutes"
# Snapshot this run's results under *_ppio names so later chunks can refer to
# them even if the working variables are overwritten.
model_array_ppio2 <- model_array
fitted_log_ratio_array_ppio <- fitted_log_ratio_array
bootstrap_log_ratio_array_ppio <- bootstrap_log_ratio_array
bootstrap_results_ppio <- bootstrap_results
p_value_array_ppio <- p_value_array
# BIC for every fitted model; vapply pins the return type (sapply could
# silently return a list if a model slot were malformed).
bic_values <- vapply(model_array_ppio2, function(model) model$bic, numeric(1))
# Retain the model with the lowest BIC as the preferred solution
best_model_index <- which.min(bic_values)
LCA_best_model_ppio <- model_array_ppio2[[best_model_index]]
#####################################################################################################################################################################
#Within poLCA, parameter estimates are obtained by a procedure that repeatedly improves estimates.
#This is stopped when no further improvements are obtained, or until a maximum number of iterations is reached. The starting values are the values at which such repetitions were started. Increasing the number of iterations (cycles within each estimation) and setting more different starting values for each repetition results in a greater likelihood that the global (rather than local) maximum of the log-likelihood function (and thus, the best possible solution) is reached. The maximum number of iterations was chosen as 10,000, and 500 different sets of starting values were used (thus going beyond the recommendations by Linzer & Lewis, 2011; Oberski, 2016). As such, the influence of chance was minimized while the reproducibility of the results was maximized (R. Achterhof et al.)#
Hicimos un gráfico de los resultados
# Relative entropy of an LCA solution: 1 means perfectly separated classes
# (posterior probabilities of 0/1), 0 means classification no better than a
# uniform spread over classes.
#
# @param lc A fitted poLCA object, or any list with a `posterior` matrix of
#   per-observation class-membership probabilities (rows = observations,
#   columns = classes).
# @return A scalar, normally in [0, 1].
relative.entropy <- function(lc) {
  # Total Shannon entropy over all cells; na.rm drops the NaN from
  # 0 * log(0) cells (the limit of p*log(p) as p -> 0 is 0).
  en <- -sum(lc$posterior * log(lc$posterior), na.rm = TRUE)
  # Normalize by the maximum possible entropy: every observation spread
  # uniformly over all classes.
  1 - en / (nrow(lc$posterior) * log(ncol(lc$posterior)))
}
# Probabilities below this tolerance are treated as zero when computing
# entropy (lim p->0 of p*log(p) is 0, but log(0) is -Inf).
machine_tolerance <- sqrt(.Machine$double.eps)
# Entropy-based pseudo-R2 for an LCA fit: the proportional reduction in
# classification uncertainty relative to assigning observations by the class
# proportions alone.
#
# @param fit A fitted poLCA object (uses `P`, the class proportions, and
#   `posterior`, the per-observation membership probabilities).
# @return A scalar: 1 = perfect classification, 0 = no improvement over P.
entropy.R2 <- function(fit) {
  entropy <- function(p) {
    # Keep only (numerically) positive probabilities so log(p) stays finite
    kept <- p[p > machine_tolerance]
    -sum(kept * log(kept))
  }
  error_prior <- entropy(fit$P)                         # uncertainty from class sizes only
  error_post <- mean(apply(fit$posterior, 1, entropy))  # mean per-observation uncertainty
  (error_prior - error_post) / error_prior
}
# "log-likelihood": The log-likelihood of the model is computed by mod$llik.
# "Chi2": The chi-squared test statistic for the model is computed by mod$Chisq.
# "Chi2_pval": The p-value for the chi-squared test is computed by mod$Chisq.pvalue.
# "resid. df": The residual degrees of freedom for the model is computed by mod$resid.df.
# "AIC" : Akaike Information Criterion computed by mod$aic.
# "BIC": The Bayesian Information Criterion for the model is computed by mod$bic.
# "aBIC": The adjusted Bayesian Information Criterion for the model is computed by (-2*mod$llik) + (log((mod$N + 2)/24) * mod$npar).
# "cAIC": The consistent Akaike Information Criterion for the model is computed by (-2*mod$llik) + mod$npar * (1 + log(mod$N)). Making AIC asymptotically coherent
# "likelihood-ratio": The likelihood-ratio test statistic for the model is computed by mod$Gsq.
# "LLik_pval": The p-value for the likelihood-ratio test is computed by mod$Gsq.pvalue.
# "Entropy": The entropy of the model, which is a measure of the quality of classification, is computed by the relative.entropy() function.
# "Entropy.R2": The proportion of variance in the data accounted for by the model entropy is computed by the entropy.R2() function.
# "Dev Change": The change in deviance (G-squared) from the previous model in model_array_ppio2 is computed by (mod_min1$Gsq - mod$Gsq).
# "df": The number of degrees of freedom for the model is computed by mod$C^mod$I - mod$npar - 1.
# "pval": The p-value for the deviance change test is computed by pchisq(mod_min1$Gsq - mod$Gsq, mod_min1$resid.df - mod$resid.df).
# "BLRT": log-likelihood ratio between the null (nclass - 1) and alternative models
# "BLRT.pvalue": p-value for the log-likelihood ratio test between the null (nclass - 1) and alternative models
# Build a fit-statistics table: one row per fitted model (2..n_class_max classes)
tab_ppio <- data.frame()
# Loop through each model together with its (k-1)-class predecessor
for (i in 2:n_class_max) {
  skip_to_next <- FALSE
  # Get the model and the previous model
  mod <- model_array_ppio2[[i]]
  mod_min1 <- model_array_ppio2[[(i-1)]]
  # Skip models that failed to produce class predictions
  if (is.null(mod$predclass)) {
    skip_to_next <- TRUE
  }
  # If the model has valid predictions, calculate the measures and add them to the table
  if (!skip_to_next) {
    # Largest number of response categories across manifest variables
    # NOTE(review): named C but this is max categories, not latent classes — confirm
    mod$C <- max(t(matrix(apply(mod$y, 2, max))))
    # Number of manifest variables
    mod$J <- ncol(mod$y)
    # Total number of items
    mod$I <- mod$J * mod$C
    # Degrees of freedom of the full cross-classification table
    mod$df <- mod$C^mod$I - mod$npar - 1
    # Chi-square goodness-of-fit p-value
    mod$Chisq.pvalue <- (1 - pchisq(mod$Chisq, mod$df))
    # AIC
    mod$aic <- round(mod$aic, 2)
    # BIC
    mod$bic <- round(mod$bic, 2)
    # Adjusted BIC with n* = (N+2)/24 (https://github.com/dlinzer/poLCA/issues/10)
    mod$aBIC <- round((-2 * mod$llik) + (log((mod$N+2)/24) * mod$npar), 2)
    # "Conditional AIC" — NOTE(review): this does NOT match the cAIC formula
    # documented earlier in this file (-2*llik + npar*(1 + log(N))); kept
    # as-is, verify the intended criterion.
    mod$cAIC <- round((-2 * mod$llik) + (2 * mod$Nobs * log(mod$N/mod$Nobs)), 2)
    # Approximate weight of evidence: AWE = -2*llik + 2*npar*(log(N) + 1.5)
    # BUG FIX: the original computed 2*npar*log(Nobs) + 1.5, i.e. the +1.5 was
    # left outside the factor multiplied by 2*npar, contradicting the formula
    # stated in the original's own comment.
    mod$awe <- round((-2 * mod$llik) + (2 * mod$npar * (log(mod$Nobs) + 1.5)), 2)
    # Likelihood-ratio / deviance (G^2) p-value
    mod$Gsq.pvalue <- (1 - pchisq(mod$Gsq, mod$df))
    # Relative entropy (classification quality, 1 = perfect separation)
    mod$RelEnt <- round(relative.entropy(mod), 2)
    # Entropy R-squared
    mod$EntR2 <- round(entropy.R2(mod), 2)
    # Deviance change vs the (k-1)-class model
    mod$DevChange <- round(mod_min1$Gsq - mod$Gsq, 2)
    # Degrees-of-freedom change vs the (k-1)-class model
    mod$dfChange <- mod_min1$resid.df - mod$resid.df
    # P-value for the deviance-change test
    mod$pvalDevChange <- round(pchisq(mod$DevChange, mod$dfChange, lower.tail = FALSE), 4)
    # Bootstrap likelihood-ratio statistic and p-value computed in the BLRT loop
    mod$BLRT <- round(fitted_log_ratio_array_ppio[[i]],2)
    mod$BLRT.pvalue <- p_value_array_ppio[[i]]
    # Add the model index to the row
    mod$ModelIndex <- i
    # Append this model's measures as one row of the table
    tab_ppio <- rbind.data.frame(tab_ppio, t(data.matrix(mod[c("llik", "Chisq", "Chisq.pvalue", "resid.df", "aic", "bic", "aBIC", "cAIC", "awe", "Gsq", "Gsq.pvalue", "RelEnt", "EntR2", "DevChange", "dfChange", "pvalDevChange", "ModelIndex","BLRT", "BLRT.pvalue")])))
  }
}
# Some columns come back list-typed after the matrix round-trip above; flatten
# them so tab_ppio holds plain atomic columns.
# vapply pins the logical return type (sapply's return shape is input-dependent)
list_cols <- vapply(tab_ppio, is.list, logical(1))
# unlist the list-like columns
unlisted_cols <- lapply(tab_ppio[list_cols], unlist)
# bind the flattened columns back onto the atomic ones
tab_ppio <- cbind(tab_ppio[!list_cols], do.call(cbind, unlisted_cols))
# Drop the automatic row names left over from rbind
rownames(tab_ppio) <- NULL
# Color palette for the fit-index comparison plot
manualcolors <- c('indianred1', 'cornflowerblue', 'gray50', 'darkolivegreen4',
                  'slateblue2', 'firebrick4', 'goldenrod4')
# Factor levels (column names in tab_ppio) and their Spanish display labels;
# the two vectors must stay in the same order and have the same length.
levels <- c("llik", "Chisq", "Chisq.pvalue", "resid.df", "aic", "bic", "aBIC",
            "cAIC", "awe", "Gsq", "Gsq.pvalue", "RelEnt", "EntR2", "DevChange",
            "dfChange", "pvalDevChange", "BLRT", "BLRT.pvalue")
# FIX: the original `labels` vector was syntactically corrupted (stray and
# doubled quotes); reconstructed with one label per level.
labels <- c('Log-Verosimilitud', 'Chi2', 'valor p Chi2', 'Grados de libertad',
            'Criterio de Información\nde Akaike(AIC)',
            'Criterio de Información\nBayesiano (BIC)',
            'BIC Ajustado (SABIC)', 'AIC Corregido',
            'Peso de evidencia aproximado(awe)', 'G-squared/Deviance',
            'Valor p G-squared', 'Entropía Relativa', 'Entropía R2',
            'Cambio en Deviance\n(con modelo previo)',
            'Grados de libertad del cambio', 'valor p cambio deviance',
            'BLRT', 'valor p BLRT')
# Line plot comparing the information criteria (AIC/BIC family + AWE) across
# numbers of latent classes.
fig_lca_fit <- tab_ppio %>%
  # Values arrive as character after the matrix round-trip; coerce back
  dplyr::mutate_if(is.character, as.numeric) %>%
  tidyr::pivot_longer(cols = -ModelIndex,  # everything except the model index
                      names_to = "indices", values_to = "value",
                      values_drop_na = FALSE) %>%
  dplyr::mutate(indices = factor(indices, levels = levels, labels = labels)) %>%
  # Keep only the information-criterion indices
  dplyr::filter(grepl("(AIC|BIC|awe)", indices, ignore.case = TRUE)) %>%
  dplyr::mutate(ModelIndex = factor(ModelIndex, levels = 2:n_class_max)) %>%
  ggplot(aes(x = ModelIndex, y = value, group = indices,
             color = indices, linetype = indices)) +
  geom_line(size = 1.5) +
  scale_color_manual(values = manualcolors) +
  # FIX: the original labs() call was syntactically corrupted (doubled quotes
  # and `==`); reconstructed with standard `=` arguments.
  labs(x = "Número de clases", y = "Valor",
       color = "Medida", linetype = "Medida") +
  theme_bw()
fig_lca_fit
ggsave("_fig1_comparison_sin_po.png", fig_lca_fit, dpi = 600)
#International Journal of Workplace Health Management (Zhang et al., 2018).
Luego en una tabla
| ModelIndex | llik | Chisq | Chisq.pvalue | resid.df | aic | bic | aBIC | cAIC | awe | Gsq | Gsq.pvalue | RelEnt | EntR2 | DevChange | dfChange | pvalDevChange | BLRT | BLRT.pvalue |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | -28802.82 | 27886436.49 | 1 | 3736 | 57711.64 | 58042.35 | 57873.94 | 57605.64 | 58480.56 | 5225.544 | 1 | 0.99 | 0.98 | 2695.80 | 27 | 0e+00 | 2695.80 | 0 |
| 3 | -28582.41 | 15802143.62 | 1 | 3709 | 57324.81 | 57824.00 | 57569.80 | 57164.81 | 58484.69 | 4784.722 | 1 | 0.90 | 0.88 | 440.82 | 27 | 0e+00 | 440.82 | 0 |
| 4 | -28404.88 | 17040334.21 | 1 | 3682 | 57023.76 | 57691.43 | 57351.43 | 56809.76 | 58574.59 | 4429.669 | 1 | 0.92 | 0.90 | 355.05 | 27 | 0e+00 | 355.05 | 0 |
| 5 | -28267.95 | 16604288.09 | 1 | 3655 | 56803.91 | 57640.05 | 57214.26 | 56535.91 | 58745.69 | 4155.814 | 1 | 0.93 | 0.91 | 273.86 | 27 | 0e+00 | 273.86 | 0 |
| 6 | -28136.33 | 19915.70 | 1 | 3628 | 56594.65 | 57599.27 | 57087.69 | 56272.65 | 58927.39 | 3892.562 | 1 | 0.90 | 0.88 | 263.25 | 27 | 0e+00 | 263.25 | 0 |
| 7 | -28044.70 | 21604.85 | 1 | 3601 | 56465.41 | 57638.50 | 57041.12 | 56089.41 | 59189.09 | 3709.314 | 1 | 0.80 | 0.78 | 183.25 | 27 | 0e+00 | 183.25 | 0 |
| 8 | -27995.87 | 12682.33 | 1 | 3574 | 56421.73 | 57763.30 | 57080.13 | 55991.73 | 59536.37 | 3611.641 | 1 | 0.82 | 0.79 | 97.67 | 27 | 0e+00 | 97.67 | 0 |
| 9 | -27950.10 | 11994.82 | 1 | 3547 | 56384.21 | 57894.25 | 57125.29 | 55900.21 | 59889.80 | 3520.116 | 1 | 0.87 | 0.85 | 91.52 | 27 | 0e+00 | 91.52 | 0 |
| 10 | -27914.42 | 11882.20 | 1 | 3520 | 56366.85 | 58045.37 | 57190.62 | 55828.85 | 60263.39 | 3448.758 | 1 | 0.85 | 0.82 | 71.36 | 27 | 0e+00 | 71.36 | 0 |
| 11 | -27884.76 | 11566.51 | 1 | 3493 | 56361.53 | 58208.53 | 57267.98 | 55769.53 | 60649.02 | 3389.436 | 1 | 0.78 | 0.74 | 59.32 | 27 | 3e-04 | 59.32 | 0 |
| 12 | -27856.08 | 11016.74 | 1 | 3466 | 56358.16 | 58373.63 | 57347.29 | 55712.16 | 61036.60 | 3332.063 | 1 | 0.76 | 0.73 | 57.37 | 27 | 6e-04 | 57.37 | 0 |
Presentamos el modelo con mejor ajuste
print(model_array_ppio[best_model_index]) #
[[1]]
Conditional item response (column) probabilities,
by outcome variable, for each class (row)
$AÑO
Pr(1) Pr(2) Pr(3) Pr(4) Pr(5) Pr(6)
class 1: 0 0.1173 0.2375 0.1863 0.2622 0.1968
class 2: 0 0.1369 0.1738 0.2442 0.1217 0.3234
class 3: 0 0.1329 0.2480 0.1763 0.2409 0.2019
class 4: 0 0.1581 0.2170 0.1846 0.2419 0.1984
class 5: 0 0.1783 0.1892 0.2013 0.1921 0.2391
class 6: 0 1.0000 0.0000 0.0000 0.0000 0.0000
$CAUSAL
Pr(1) Pr(2) Pr(3) Pr(4)
class 1: 0 0.4486 0.5514 0.0000
class 2: 0 0.0515 0.0035 0.9450
class 3: 0 0.3861 0.6139 0.0000
class 4: 0 0.1871 0.8129 0.0000
class 5: 0 0.0051 0.0000 0.9949
class 6: 0 0.5888 0.4112 0.0000
$EDAD_MUJER_REC
Pr(1) Pr(2) Pr(3) Pr(4) Pr(5)
class 1: 0.0000 0.0090 0.1665 0.5034 0.3212
class 2: 0.0000 0.1603 0.2310 0.4516 0.1571
class 3: 0.0000 0.0166 0.2102 0.4657 0.3076
class 4: 0.0000 0.0000 0.0375 0.4238 0.5387
class 5: 0.0017 0.3483 0.2461 0.2951 0.1088
class 6: 0.0747 0.0259 0.2019 0.3966 0.3008
$PAIS_ORIGEN_REC
Pr(1) Pr(2) Pr(3)
class 1: 0.0000 0.0000 1.0000
class 2: 0.0000 0.0000 1.0000
class 3: 0.0000 1.0000 0.0000
class 4: 0.0000 0.9242 0.0758
class 5: 0.0000 0.9945 0.0055
class 6: 0.0791 0.8170 0.1038
$HITO1_EDAD_GEST_SEM_REC
Pr(1) Pr(2) Pr(3) Pr(4)
class 1: 0.0221 0.1076 0.7594 0.1110
class 2: 0.0194 0.9806 0.0000 0.0000
class 3: 0.0278 0.1860 0.6825 0.1037
class 4: 0.0145 0.3711 0.5646 0.0498
class 5: 0.0088 0.9877 0.0035 0.0000
class 6: 0.0394 0.0956 0.4469 0.4181
$MACROZONA
Pr(1) Pr(2) Pr(3) Pr(4) Pr(5) Pr(6)
class 1: 0.0000 0.5963 0.0860 0.0615 0.2119 0.0443
class 2: 0.0060 0.5503 0.0886 0.0175 0.2918 0.0459
class 3: 0.0000 0.3373 0.1760 0.2026 0.0966 0.1875
class 4: 0.0000 0.7213 0.0921 0.0522 0.0691 0.0654
class 5: 0.0036 0.3851 0.1750 0.1452 0.0862 0.2049
class 6: 0.0350 0.2252 0.3401 0.1308 0.0919 0.1771
$PREV_TRAMO_REC
Pr(1) Pr(2) Pr(3) Pr(4) Pr(5)
class 1: 0.0013 0.0127 0.6112 0.2984 0.0763
class 2: 0.0291 0.0066 0.5242 0.1646 0.2755
class 3: 0.0006 0.0289 0.6156 0.3550 0.0000
class 4: 0.0000 0.8325 0.0000 0.1597 0.0078
class 5: 0.0018 0.0696 0.7239 0.1995 0.0053
class 6: 0.0282 0.0254 0.6133 0.2807 0.0524
Estimated class population shares
0.1192 0.0441 0.5041 0.1205 0.152 0.06
Predicted class memberships (by modal posterior prob.)
0.1219 0.0435 0.5199 0.1143 0.1507 0.0496
=========================================================
Fit for 6 latent classes:
=========================================================
number of observations: 3789
number of estimated parameters: 161
residual degrees of freedom: 3628
maximum log-likelihood: -28136.33
AIC(6): 56594.65
BIC(6): 57599.27
G^2(6): 3892.562 (Likelihood ratio/deviance statistic)
X^2(6): 19915.7 (Chi-square goodness of fit)
[1] "2023-05-13 05:57:56 -04"
# Persist the full workspace so the analysis can be reloaded without refitting
save.image("data2_lca2_sin_po.RData")
library(tidyverse)  # library() fails loudly if missing, unlike require()
# Render the loaded-package list as an interactive DT table
sesion_info <- devtools::session_info()
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
) %>%
  # FIX: the original headers said 'Variable'/'Percentage', which do not match
  # the selected columns (package, loadedversion, source).
  DT::datatable(filter = 'top',
                colnames = c('Row number' = 1, 'Package' = 2,
                             'Version' = 3, 'Source' = 4),
                caption = htmltools::tags$caption(
                  style = 'caption-side: top; text-align: left;',
                  '', htmltools::em('Packages')),
                options = list(
                  initComplete = htmlwidgets::JS(
                    "function(settings, json) {",
                    "$(this.api().tables().body()).css({
  'font-family': 'Helvetica Neue',
  'font-size': '50%',
  'code-inline-font-size': '15%',
  'white-space': 'nowrap',
  'line-height': '0.75em',
  'min-height': '0.5em'
});",
                    "}")))