Data Exploration

From Multilevel Modeling to GEE: Revisiting the Within- and Between-Person Debate with Binary Predictors and Outcomes

Author

Affiliation

Ward B. Eiling

Utrecht University

Published

May 12, 2024

This document allows for the exploration of the four data generating mechanisms underlying the simulation study of the manuscript “From Multilevel Modeling to GEE: Revisiting the Within- and Between-Person Debate with Binary Predictors and Outcomes”. The parameters have the following meaning

N_total: the total number of individuals in the dataset
T_total: the total number of time points in the dataset
predictor.type: the type of predictor variable, either “continuous” or “binary”
outcome.type: the type of outcome variable, either “continuous” or “binary”
sdX.within: the standard deviation of the within-person predictor variable
sdX.between: the standard deviation of the between-person predictor variable (for continuous X; for binary X it is SD of the latent variable underlying the person-specific propensity)
g.00: the intercept of the outcome variable
g.01: the contextual slope (under mundlak’s contextual method)
sd.u0: the standard deviation of the random intercept residual
g.10: the within-person person slope (\(\beta_1\) in the manuscript)
sd.e: the standard deviation of the level 1 residual error
true_cluster_means: whether to use the true cluster means or the estimated cluster means in the model
X.mean.j: the true mean of the predictor variable for each individual (continuous predictors)
p.X.mean.j: the true underlying propensity of the predictor variable for each individual (binary predictors)
X.cluster.means: the estimated mean of the predictor variable for each individual
p_Y: the propensity on the outcome variable for each individual
eta: the linear predictor of the outcome variable
b0: the person-specific random intercept of the outcome variable

Code

# load libraries
library(ggplot2)
library(ggpubr)
library(lme4)
library(here)
library(tidyr)
library(dplyr)

# source the helper function
# source(here("scripts/hamaker2020_extended/helper-functions/data-generation-centeredX.R"))
source(here("scripts/helper-functions/data-generation-mundlak.R"))

1 DGM 1: Continuous X and Y

Code

data_cont <- glmm_data_generation(N_total = 1000, T_total = 20, predictor.type = "continuous", outcome.type = "continuous", sdX.within = 1, sdX.between = 3, g.00 = 0, g.01 = 3, sd.u0 = 1, g.10 = 1.5, sd.u1 = 0, sd.e = 1, true_cluster_means = FALSE)
summary(data_cont)

    Cluster            Time          X.mean.j              b0           
 Min.   :   1.0   Min.   : 1.00   Min.   :-8.73802   Min.   :-26.00655  
 1st Qu.: 250.8   1st Qu.: 5.75   1st Qu.:-1.96944   1st Qu.: -6.20499  
 Median : 500.5   Median :10.50   Median : 0.01544   Median :  0.36694  
 Mean   : 500.5   Mean   :10.50   Mean   : 0.01306   Mean   :  0.06969  
 3rd Qu.: 750.2   3rd Qu.:15.25   3rd Qu.: 2.00223   3rd Qu.:  5.93257  
 Max.   :1000.0   Max.   :20.00   Max.   :10.61089   Max.   : 31.31880  
       b1      p.X.mean.j           X                   eta          
 Min.   :1.5   Mode:logical   Min.   :-11.297146   Min.   :-42.3002  
 1st Qu.:1.5   NA's:20000     1st Qu.: -2.108913   1st Qu.: -9.1473  
 Median :1.5                  Median :  0.038750   Median :  0.4547  
 Mean   :1.5                  Mean   :  0.008669   Mean   :  0.0827  
 3rd Qu.:1.5                  3rd Qu.:  2.091453   3rd Qu.:  8.9911  
 Max.   :1.5                  Max.   : 13.868826   Max.   : 52.1220  
       Y               p.Y          X.cluster.means         X.cent         
 Min.   :-42.64211   Mode:logical   Min.   :-9.096363   Min.   :-4.117487  
 1st Qu.: -9.14313   NA's:20000     1st Qu.:-2.012894   1st Qu.:-0.654894  
 Median :  0.43707                  Median : 0.066049   Median : 0.004177  
 Mean   :  0.06847                  Mean   : 0.008669   Mean   : 0.000000  
 3rd Qu.:  8.95112                  3rd Qu.: 1.977549   3rd Qu.: 0.664019  
 Max.   : 50.61548                  Max.   :10.726856   Max.   : 4.231228

Code

fixef(lmer(Y ~ X.cent + X.cluster.means + (1 | Cluster), data = data_cont))

    (Intercept)          X.cent X.cluster.means 
     0.02967602      1.51076595      4.47522254

Code

# 1.1 check distributions of X and Y
p1.11 <- ggplot(data_cont, aes(x = Y)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of Y") +
  geom_vline(xintercept = mean(data_cont$Y), linetype = "dashed", color = "red")
p1.12 <- ggplot(data_cont, aes(x = X)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of X") +
  geom_vline(xintercept = mean(data_cont$X), linetype = "dashed", color = "red")

# 1.2 check histogram of eta
p1.2 <- ggplot(data_cont, aes(x = eta)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of eta") +
  geom_vline(xintercept = mean(data_cont$eta), linetype = "dashed", color = "red")

# combine plots
gridExtra::grid.arrange(p1.11, p1.12, p1.2, ncol = 2)

Code

# Reduce to only cluster level variables
data_cont_level2 <- data_cont %>% select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%
  distinct(Cluster, .keep_all = TRUE)

# 1.4 scatterplot of b0 and eta and X.mean.j and X.cluster.means
p1.31 <- ggplot(data_cont_level2, aes(x = b0, y = eta)) + geom_point() + labs(title = "Scatterplot of b0 and eta") + stat_cor(method = "pearson", digits = 3)
p1.32 <- ggplot(data_cont_level2, aes(x = X.mean.j, y = X.cluster.means)) + geom_point() + labs(title = "Scatterplot of X.mean.j and X.cluster.means", x = "True cluster mean", y = "Estimated cluster mean") + stat_cor(method = "pearson", digits = 3)

# combine plots
gridExtra::grid.arrange(p1.31, p1.32, ncol = 2)

Code

# 1.3 overlaying density plots for X.cluster.means and X.mean.j
p1.41 <- ggplot(data_cont_level2, aes(x = X.cluster.means, fill = "X.cluster.means")) + geom_density(alpha = 0.5) +
  geom_density(aes(x = X.mean.j, fill = "X.mean.j"), alpha = 0.5) + labs(title = "Density plot of X.cluster.means and X.mean.j")

# compute difference between estimated and true cluster mean for each cluster
data_cont_level2$diff <- data_cont_level2$X.cluster.means - data_cont_level2$X.mean.j

# plot histogram of differences
p1.42 <- ggplot(data_cont_level2, aes(x = diff)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of differences between estimated and true cluster mean") +
  geom_vline(xintercept = mean(data_cont$diff), linetype = "dashed", color = "red")

# combine plots
gridExtra::grid.arrange(p1.41, p1.42, nrow = 2)

2 DGM 2: Binary X and Continuous Y

Code

### 2 generate binary X and continuous Y
data_binx_conty <- glmm_data_generation(N_total = 1000, T_total = 20, predictor.type = "binary", outcome.type = "continuous", sdX.within = NA, sdX.between = 3, g.00 = 0, g.01 = 3, sd.u0 = 1, g.10 = 1.5, sd.u1 = 0, sd.e = 1, true_cluster_means = FALSE)
# same values as (1) but without sdX.within
summary(data_binx_conty)

    Cluster            Time          X.mean.j              b0         
 Min.   :   1.0   Min.   : 1.00   Min.   :-10.7903   Min.   :-3.1207  
 1st Qu.: 250.8   1st Qu.: 5.75   1st Qu.: -2.1764   1st Qu.: 0.3576  
 Median : 500.5   Median :10.50   Median : -0.1994   Median : 1.3997  
 Mean   : 500.5   Mean   :10.50   Mean   : -0.1932   Mean   : 1.4160  
 3rd Qu.: 750.2   3rd Qu.:15.25   3rd Qu.:  1.7651   3rd Qu.: 2.4190  
 Max.   :1000.0   Max.   :20.00   Max.   :  9.9750   Max.   : 4.9124  
       b1        p.X.mean.j              X               eta         
 Min.   :1.5   Min.   :0.0000206   Min.   :0.0000   Min.   :-3.1207  
 1st Qu.:1.5   1st Qu.:0.1018935   1st Qu.:0.0000   1st Qu.: 0.5574  
 Median :1.5   Median :0.4503058   Median :0.0000   Median : 2.0724  
 Mean   :1.5   Mean   :0.4770429   Mean   :0.4813   Mean   : 2.1378  
 3rd Qu.:1.5   3rd Qu.:0.8538363   3rd Qu.:1.0000   3rd Qu.: 3.6970  
 Max.   :1.5   Max.   :0.9999535   Max.   :1.0000   Max.   : 6.4124  
       Y             p.Y          X.cluster.means      X.cent     
 Min.   :-5.4635   Mode:logical   Min.   :0.0000   Min.   :-0.95  
 1st Qu.: 0.4949   NA's:20000     1st Qu.:0.1000   1st Qu.:-0.15  
 Median : 2.1103                  Median :0.4500   Median : 0.00  
 Mean   : 2.1345                  Mean   :0.4813   Mean   : 0.00  
 3rd Qu.: 3.7888                  3rd Qu.:0.8500   3rd Qu.: 0.15  
 Max.   : 8.8697                  Max.   :1.0000   Max.   : 0.95

Code

fixef(lmer(Y ~ X.cent + X.cluster.means + (1 | Cluster), data = data_binx_conty))

    (Intercept)          X.cent X.cluster.means 
     0.07691846      1.51889348      4.27556551

Code

# 2.1 check distributions of X and Y
p2.11 <- ggplot(data_binx_conty, aes(x = Y)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of Y") +
  geom_vline(xintercept = mean(data_binx_conty$Y), linetype = "dashed", color = "red")
p2.12 <- ggplot(data_binx_conty, aes(x = X)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of X") +
  geom_vline(xintercept = mean(data_binx_conty$X), linetype = "dashed", color = "red")

# Reduce to only cluster level variables
data_binx_conty_level2 <- data_binx_conty %>% select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%
  distinct(Cluster, .keep_all = TRUE)

# 2.2 histograms X.mean.j, p.X.mean.j and eta
p2.21 <- ggplot(data_binx_conty_level2, aes(x = X.mean.j)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of X.mean.j") +
  geom_vline(xintercept = mean(data_binx_conty$X.mean.j), linetype = "dashed", color = "red")
p2.22 <- ggplot(data_binx_conty_level2, aes(x = p.X.mean.j)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of p.X.mean.j") +
  geom_vline(xintercept = mean(data_binx_conty$p.X.mean.j), linetype = "dashed", color = "red")
p2.23 <- ggplot(data_binx_conty_level2, aes(x = eta)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of eta") +
  geom_vline(xintercept = mean(data_binx_conty$eta), linetype = "dashed", color = "red")
# eta is nicely distributed with -3 to 3 range

# when sdX.between is large (e.g, sqrt(4)) the distribution of probabilities 
# of X and Y are U/bowl shaped (higher density at the edges)
# when sdX.between is small (e.g., sqrt(0.4)) the distribution of probabilities
# of X and Y are inverted U shaped (higher density in the center)

# combine plots
gridExtra::grid.arrange(p2.11, p2.12, p2.21, p2.22, p2.23, ncol = 2)

Code

# 2.3 scatterplot of p.X.mean.j and b0, b0 and eta, estimated cluster mean and p.X.mean.j
p2.31 <- ggplot(data_binx_conty_level2, aes(x = p.X.mean.j, y = b0)) + geom_point() + labs(title = "Scatterplot of p.X.mean.j and b0") + stat_cor(method = "pearson", digits = 3)

p2.32 <- ggplot(data_binx_conty_level2, aes(x = b0, y = eta)) + geom_point() + labs(title = "Scatterplot of b0 and eta") + stat_cor(method = "pearson", digits = 3)
p2.33 <- ggplot(data_binx_conty_level2, aes(x = p.X.mean.j, y = X.cluster.means)) + geom_point() + labs(title = "Scatterplot of X.cluster.means and p.X.mean.j", x = "True underlying probability", y = "Observed proportion") + stat_cor(method = "pearson", digits = 3)

# combine plots
gridExtra::grid.arrange(p2.31, p2.32, p2.33, ncol = 2)

Code

# create overlapping density plots for X.cluster.means and p.X.mean.j
p2.41 <- ggplot(data_binx_conty_level2, aes(x = X.cluster.means, fill = "X.cluster.means")) + geom_histogram(alpha = 0.8, binwidth = 0.02) +
  geom_histogram(aes(x = p.X.mean.j, fill = "p.X.mean.j"), alpha = 0.8, binwidth = 0.02) + labs(title = "Density plot of X.cluster.means and p.X.mean.j", x = "value")

# compute difference between estimated and true cluster mean for each cluster
data_binx_conty_level2$diff <- data_binx_conty_level2$X.cluster.means - data_binx_conty_level2$p.X.mean.j

# plot histogram of differences
p2.42 <- ggplot(data_binx_conty_level2, aes(x = diff)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of differences between estimated and true cluster mean") +
  geom_vline(xintercept = mean(data_binx_conty_level2$diff, na.rm = TRUE), linetype = "dashed", color = "red")

# combine plots
gridExtra::grid.arrange(p2.41, p2.42, nrow = 2)

3 DGM 3: Continuous X and Binary Y

Code

### 3 generate continuous X and binary Y ###
data_contx_biny <- glmm_data_generation(N_total = 1000, T_total = 20, predictor.type = "continuous", outcome.type = "binary",
                                        sdX.within = 1, sdX.between = 3, g.00 = 0, g.01 = 3, sd.u0 = 1, g.10 = 1.5, sd.u1 = 0, sd.e = NA, true_cluster_means = FALSE)

summary(data_contx_biny)

    Cluster            Time          X.mean.j              b0          
 Min.   :   1.0   Min.   : 1.00   Min.   :-9.07807   Min.   :-28.1225  
 1st Qu.: 250.8   1st Qu.: 5.75   1st Qu.:-1.98921   1st Qu.: -6.2389  
 Median : 500.5   Median :10.50   Median : 0.03053   Median :  0.1808  
 Mean   : 500.5   Mean   :10.50   Mean   : 0.08772   Mean   :  0.2572  
 3rd Qu.: 750.2   3rd Qu.:15.25   3rd Qu.: 2.19022   3rd Qu.:  6.7083  
 Max.   :1000.0   Max.   :20.00   Max.   :10.76135   Max.   : 32.4569  
       b1      p.X.mean.j           X                  eta          
 Min.   :1.5   Mode:logical   Min.   :-10.61623   Min.   :-44.0469  
 1st Qu.:1.5   NA's:20000     1st Qu.: -2.04541   1st Qu.: -8.9802  
 Median :1.5                  Median :  0.05029   Median :  0.1885  
 Mean   :1.5                  Mean   :  0.11174   Mean   :  0.4248  
 3rd Qu.:1.5                  3rd Qu.:  2.27474   3rd Qu.:  9.9190  
 Max.   :1.5                  Max.   : 12.49205   Max.   : 50.7784  
       Y              p.Y            X.cluster.means       X.cent         
 Min.   :0.000   Min.   :0.0000000   Min.   :-9.3529   Min.   :-3.568000  
 1st Qu.:0.000   1st Qu.:0.0001259   1st Qu.:-1.9292   1st Qu.:-0.658154  
 Median :1.000   Median :0.5469822   Median : 0.0441   Median :-0.002651  
 Mean   :0.505   Mean   :0.5055416   Mean   : 0.1117   Mean   : 0.000000  
 3rd Qu.:1.000   3rd Qu.:0.9999508   3rd Qu.: 2.1982   3rd Qu.: 0.648543  
 Max.   :1.000   Max.   :1.0000000   Max.   :10.7529   Max.   : 3.689867

Code

fixef(glmer(Y ~ X.cent + X.cluster.means + (1 | Cluster), data = data_contx_biny, family = binomial(link = "logit")))

    (Intercept)          X.cent X.cluster.means 
    -0.03122169      1.49246514      4.54143217

Code

# 3.1 check distributions of X and Y
p3.11 <- ggplot(data_contx_biny, aes(x = Y)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of Y") +
  geom_vline(xintercept = mean(data_contx_biny$Y), linetype = "dashed", color = "red")
p3.12 <- ggplot(data_contx_biny, aes(x = X)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of X") +
  geom_vline(xintercept = mean(data_contx_biny$X), linetype = "dashed", color = "red")

# only keep level 2 variables
data_contx_biny_level2 <- data_contx_biny %>% select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%
  distinct(Cluster, .keep_all = TRUE)

# 3.2 check distributions of eta and p_y
p3.21 <- ggplot(data_contx_biny_level2, aes(x = eta)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of eta") +
  geom_vline(xintercept = mean(data_contx_biny$eta), linetype = "dashed", color = "red")
p3.22 <- ggplot(data_contx_biny, aes(x = p.Y)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of p_y") +
  geom_vline(xintercept = mean(data_contx_biny$p.Y), linetype = "dashed", color = "red")

# combine plots
gridExtra::grid.arrange(p3.11, p3.12, p3.21, p3.22, ncol = 2)

Code

# 3.3 scatterplot of b0 and eta and X.mean.j and X.cluster.means
p3.31 <- ggplot(data_contx_biny, aes(x = b0, y = eta)) + geom_point() + labs(title = "Scatterplot of b0 and eta") + stat_cor(method = "pearson", digits = 3)
p3.32 <- ggplot(data_contx_biny, aes(x = X.mean.j, y = X.cluster.means)) + geom_point() + labs(title = "Scatterplot of X.mean.j and X.cluster.means", x = "True cluster mean", y = "Estimated cluster mean") + stat_cor(method = "pearson", digits = 3)

# combine plots
gridExtra::grid.arrange(p3.31, p3.32, ncol = 2)

Code

# create overlapping density plots for X.cluster.means and p.X.mean.j
p3.41 <- ggplot(data_contx_biny_level2, aes(x = X.cluster.means, fill = "X.cluster.means")) + geom_density(alpha = 0.5) +
  geom_density(aes(x = X.mean.j, fill = "p.X.mean.j"), alpha = 0.5) + labs(title = "Density plot of X.cluster.means and X.mean.j", x = "value")

# compute difference between estimated and true cluster mean for each cluster
data_contx_biny_level2$diff <- data_contx_biny_level2$X.cluster.means - data_contx_biny_level2$X.mean.j

# plot histogram of differences
p3.42 <- ggplot(data_contx_biny_level2, aes(x = diff)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of differences between estimated and true cluster mean") +
  geom_vline(xintercept = mean(data_contx_biny_level2$diff), linetype = "dashed", color = "red")

# combine plots
gridExtra::grid.arrange(p3.41, p3.42, nrow = 2)

4 DGM 4: Binary X and Y

Code

# 4 generate binary X and Y
data_binary <- glmm_data_generation(N_total = 1000, T_total = 20, predictor.type = "binary", outcome.type = "binary",
                                    sdX.within = NA, sdX.between = 3, g.00 = 0, g.01 = 3, sd.u0 = 1, g.10 = 1.5, sd.u1 = 0, sd.e = NA, true_cluster_means = FALSE)

summary(data_binary)

    Cluster            Time          X.mean.j              b0         
 Min.   :   1.0   Min.   : 1.00   Min.   :-9.08314   Min.   :-2.3044  
 1st Qu.: 250.8   1st Qu.: 5.75   1st Qu.:-2.03175   1st Qu.: 0.4382  
 Median : 500.5   Median :10.50   Median :-0.01095   Median : 1.4624  
 Mean   : 500.5   Mean   :10.50   Mean   :-0.01244   Mean   : 1.4754  
 3rd Qu.: 750.2   3rd Qu.:15.25   3rd Qu.: 1.98748   3rd Qu.: 2.4943  
 Max.   :1000.0   Max.   :20.00   Max.   : 9.51789   Max.   : 5.3999  
       b1        p.X.mean.j              X               eta        
 Min.   :1.5   Min.   :0.0001136   Min.   :0.0000   Min.   :-2.304  
 1st Qu.:1.5   1st Qu.:0.1159099   1st Qu.:0.0000   1st Qu.: 0.629  
 Median :1.5   Median :0.4972631   Median :0.0000   Median : 2.128  
 Mean   :1.5   Mean   :0.4943383   Mean   :0.4939   Mean   : 2.216  
 3rd Qu.:1.5   3rd Qu.:0.8794748   3rd Qu.:1.0000   3rd Qu.: 3.807  
 Max.   :1.5   Max.   :0.9999265   Max.   :1.0000   Max.   : 6.900  
       Y              p.Y          X.cluster.means      X.cent     
 Min.   :0.000   Min.   :0.09076   Min.   :0.0000   Min.   :-0.95  
 1st Qu.:1.000   1st Qu.:0.65226   1st Qu.:0.1000   1st Qu.:-0.10  
 Median :1.000   Median :0.89358   Median :0.5000   Median : 0.00  
 Mean   :0.792   Mean   :0.79295   Mean   :0.4939   Mean   : 0.00  
 3rd Qu.:1.000   3rd Qu.:0.97828   3rd Qu.:0.9000   3rd Qu.: 0.10  
 Max.   :1.000   Max.   :0.99899   Max.   :1.0000   Max.   : 0.95

Code

fixef(glmer(Y ~ X.cent + X.cluster.means + (1 | Cluster), data = data_binary, family = binomial(link = "logit")))

    (Intercept)          X.cent X.cluster.means 
       0.121757        1.562588        4.202865

Code

# 4.1 check distributions of X and Y
p4.11 <- ggplot(data_binary, aes(x = Y)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of Y") +
  geom_vline(xintercept = mean(data_binary$Y), linetype = "dashed", color = "red")
p4.12 <- ggplot(data_binary, aes(x = X)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of X") +
  geom_vline(xintercept = mean(data_binary$X), linetype = "dashed", color = "red")

# only keep level 2 variables
data_binary_level2 <- data_binary %>% select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%
  distinct(Cluster, .keep_all = TRUE)

# 4.2 histograms with density overlay for evaluation of probability distributions
p4.21 <- ggplot(data_binary_level2, aes(x = eta)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of eta") +
  geom_vline(xintercept = mean(data_binary$eta), linetype = "dashed", color = "red")
p4.22 <- ggplot(data_binary_level2, aes(x = p.X.mean.j)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of p.X.mean.j") +
  geom_vline(xintercept = mean(data_binary$p.X.mean.j), linetype = "dashed", color = "red")
p4.23 <- ggplot(data_binary, aes(x = p.Y)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of p_y") +
  geom_vline(xintercept = mean(data_binary$p.Y), linetype = "dashed", color = "red")

# when sdX.between is large (e.g, sqrt(4)) the distribution of probabilities 
# of X and Y are U/bowl shaped (higher density at the edges)
# when sdX.between is small (e.g., sqrt(0.4)) the distribution of probabilities 
# of X and Y are inverted U shaped (higher density in the center)
# when sdX.between is moderate (e.g., sqrt(2)) the distribution of probabilities
# is more uniform but inverted U shaped for X and slightly U shaped for Y.

# combine plots
gridExtra::grid.arrange(p4.11, p4.12, p4.21, p4.22, p4.23, ncol = 2)

Code

# 4.3 scatterplots for evaluation of relationship between probabilities and originating variables

# between p.X.mean.j and b0
p4.31 <- ggplot(data_binary_level2, aes(x = p.X.mean.j, y = b0)) + geom_point() + labs(title = "Scatterplot of p.X.mean.j and b0") + stat_cor(method = "pearson", digits = 3)

# between b0 and eta
p4.32 <- ggplot(data_binary_level2, aes(x = b0, y = eta)) + geom_point() + labs(title = "Scatterplot of b0 and eta") + stat_cor(method = "pearson", digits = 3)

# between X.cluster.means and p.X.mean.j
p4.33 <- ggplot(data_binary_level2, aes(x = p.X.mean.j, y = X.cluster.means)) + geom_point() + labs(title = "Scatterplot of X.cluster.means and p.X.mean.j", x = "True underlying probability", y = "Observed proportion") + stat_cor(method = "pearson", digits = 3)

# combine plots
gridExtra::grid.arrange(p4.31, p4.32, p4.33, ncol = 2)

Code

# Other: plots below only really indicate logit relationship
# ggplot(data_binary, aes(x = X.mean.j, y = p.X.mean.j)) + geom_point() + labs(title = "Scatterplot of X.mean.j and p.X.mean.j")
# As expected, we see a a sigmoid curve representing the logit relationship between X.mean.j 
# and p.X.mean.j. It is more pronounced (different from linear) when sdX.between is large.

# ggplot(data_binary, aes(x = eta, y = p.Y)) + geom_point() + labs(title = "Scatterplot of eta and p_y")
# As expected again, we see a sigmoid curve representing the logit relationship between eta
# and p_y. It is more pronounced (different from linear) when sdX.between and sd.u0 are large.

# create overlapping density plots for X.cluster.means and p.X.mean.j
p4.41 <- ggplot(data_binary_level2, aes(x = X.cluster.means, fill = "X.cluster.means")) + geom_histogram(alpha = 0.8, binwidth = 0.02) +
  geom_histogram(aes(x = p.X.mean.j, fill = "p.X.mean.j"), alpha = 0.8) + labs(title = "Density plot of X.cluster.means and p.X.mean.j", x = "value")

# compute difference between estimated and true cluster mean for each cluster
data_binary_level2$diff <- data_binary_level2$X.cluster.means - data_binary_level2$p.X.mean.j

# plot histogram of differences
p4.42 <- ggplot(data_binary_level2, aes(x = diff)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.5, fill = "blue") + labs(title = "Histogram of differences between estimated and true cluster mean") +
  geom_vline(xintercept = mean(data_binary_level2$diff), linetype = "dashed", color = "red")

# combine plots
gridExtra::grid.arrange(p4.41, p4.42, nrow = 2)