From Multilevel Modeling to GEE: Revisiting the Within- and Between-Person Debate with Binary Predictors and Outcomes
Author
Affiliation
Ward B. Eiling
Utrecht University
Published
May 12, 2024
This document allows for the exploration of the four data generating mechanisms underlying the simulation study of the manuscript “From Multilevel Modeling to GEE: Revisiting the Within- and Between-Person Debate with Binary Predictors and Outcomes”. The parameters have the following meaning
N_total: the total number of individuals in the dataset
T_total: the total number of time points in the dataset
predictor.type: the type of predictor variable, either “continuous” or “binary”
outcome.type: the type of outcome variable, either “continuous” or “binary”
sdX.within: the standard deviation of the within-person predictor variable
sdX.between: the standard deviation of the between-person predictor variable (for continuous X; for binary X it is SD of the latent variable underlying the person-specific propensity)
g.00: the intercept of the outcome variable
g.01: the contextual slope (under mundlak’s contextual method)
sd.u0: the standard deviation of the random intercept residual
g.10: the within-person person slope (\(\beta_1\) in the manuscript)
sd.e: the standard deviation of the level 1 residual error
true_cluster_means: whether to use the true cluster means or the estimated cluster means in the model
X.mean.j: the true mean of the predictor variable for each individual (continuous predictors)
p.X.mean.j: the true underlying propensity of the predictor variable for each individual (binary predictors)
X.cluster.means: the estimated mean of the predictor variable for each individual
p_Y: the propensity on the outcome variable for each individual
eta: the linear predictor of the outcome variable
b0: the person-specific random intercept of the outcome variable
Code
# load librarieslibrary(ggplot2)library(ggpubr)library(lme4)library(here)library(tidyr)library(dplyr)# source the helper function# source(here("scripts/hamaker2020_extended/helper-functions/data-generation-centeredX.R"))source(here("scripts/helper-functions/data-generation-mundlak.R"))
# 1.1 check distributions of X and Yp1.11<-ggplot(data_cont, aes(x = Y)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of Y") +geom_vline(xintercept =mean(data_cont$Y), linetype ="dashed", color ="red")p1.12<-ggplot(data_cont, aes(x = X)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of X") +geom_vline(xintercept =mean(data_cont$X), linetype ="dashed", color ="red")# 1.2 check histogram of etap1.2<-ggplot(data_cont, aes(x = eta)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of eta") +geom_vline(xintercept =mean(data_cont$eta), linetype ="dashed", color ="red")# combine plotsgridExtra::grid.arrange(p1.11, p1.12, p1.2, ncol =2)
Code
# Reduce to only cluster level variablesdata_cont_level2 <- data_cont %>%select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%distinct(Cluster, .keep_all =TRUE)# 1.4 scatterplot of b0 and eta and X.mean.j and X.cluster.meansp1.31<-ggplot(data_cont_level2, aes(x = b0, y = eta)) +geom_point() +labs(title ="Scatterplot of b0 and eta") +stat_cor(method ="pearson", digits =3)p1.32<-ggplot(data_cont_level2, aes(x = X.mean.j, y = X.cluster.means)) +geom_point() +labs(title ="Scatterplot of X.mean.j and X.cluster.means", x ="True cluster mean", y ="Estimated cluster mean") +stat_cor(method ="pearson", digits =3)# combine plotsgridExtra::grid.arrange(p1.31, p1.32, ncol =2)
Code
# 1.3 overlaying density plots for X.cluster.means and X.mean.jp1.41<-ggplot(data_cont_level2, aes(x = X.cluster.means, fill ="X.cluster.means")) +geom_density(alpha =0.5) +geom_density(aes(x = X.mean.j, fill ="X.mean.j"), alpha =0.5) +labs(title ="Density plot of X.cluster.means and X.mean.j")# compute difference between estimated and true cluster mean for each clusterdata_cont_level2$diff <- data_cont_level2$X.cluster.means - data_cont_level2$X.mean.j# plot histogram of differencesp1.42<-ggplot(data_cont_level2, aes(x = diff)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of differences between estimated and true cluster mean") +geom_vline(xintercept =mean(data_cont$diff), linetype ="dashed", color ="red")# combine plotsgridExtra::grid.arrange(p1.41, p1.42, nrow =2)
2 DGM 2: Binary X and Continuous Y
Code
### 2 generate binary X and continuous Ydata_binx_conty <-glmm_data_generation(N_total =1000, T_total =20, predictor.type ="binary", outcome.type ="continuous", sdX.within =NA, sdX.between =3, g.00 =0, g.01 =3, sd.u0 =1, g.10 =1.5, sd.u1 =0, sd.e =1, true_cluster_means =FALSE)# same values as (1) but without sdX.withinsummary(data_binx_conty)
Cluster Time X.mean.j b0
Min. : 1.0 Min. : 1.00 Min. :-10.7903 Min. :-3.1207
1st Qu.: 250.8 1st Qu.: 5.75 1st Qu.: -2.1764 1st Qu.: 0.3576
Median : 500.5 Median :10.50 Median : -0.1994 Median : 1.3997
Mean : 500.5 Mean :10.50 Mean : -0.1932 Mean : 1.4160
3rd Qu.: 750.2 3rd Qu.:15.25 3rd Qu.: 1.7651 3rd Qu.: 2.4190
Max. :1000.0 Max. :20.00 Max. : 9.9750 Max. : 4.9124
b1 p.X.mean.j X eta
Min. :1.5 Min. :0.0000206 Min. :0.0000 Min. :-3.1207
1st Qu.:1.5 1st Qu.:0.1018935 1st Qu.:0.0000 1st Qu.: 0.5574
Median :1.5 Median :0.4503058 Median :0.0000 Median : 2.0724
Mean :1.5 Mean :0.4770429 Mean :0.4813 Mean : 2.1378
3rd Qu.:1.5 3rd Qu.:0.8538363 3rd Qu.:1.0000 3rd Qu.: 3.6970
Max. :1.5 Max. :0.9999535 Max. :1.0000 Max. : 6.4124
Y p.Y X.cluster.means X.cent
Min. :-5.4635 Mode:logical Min. :0.0000 Min. :-0.95
1st Qu.: 0.4949 NA's:20000 1st Qu.:0.1000 1st Qu.:-0.15
Median : 2.1103 Median :0.4500 Median : 0.00
Mean : 2.1345 Mean :0.4813 Mean : 0.00
3rd Qu.: 3.7888 3rd Qu.:0.8500 3rd Qu.: 0.15
Max. : 8.8697 Max. :1.0000 Max. : 0.95
# 2.1 check distributions of X and Yp2.11<-ggplot(data_binx_conty, aes(x = Y)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of Y") +geom_vline(xintercept =mean(data_binx_conty$Y), linetype ="dashed", color ="red")p2.12<-ggplot(data_binx_conty, aes(x = X)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of X") +geom_vline(xintercept =mean(data_binx_conty$X), linetype ="dashed", color ="red")# Reduce to only cluster level variablesdata_binx_conty_level2 <- data_binx_conty %>%select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%distinct(Cluster, .keep_all =TRUE)# 2.2 histograms X.mean.j, p.X.mean.j and etap2.21<-ggplot(data_binx_conty_level2, aes(x = X.mean.j)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of X.mean.j") +geom_vline(xintercept =mean(data_binx_conty$X.mean.j), linetype ="dashed", color ="red")p2.22<-ggplot(data_binx_conty_level2, aes(x = p.X.mean.j)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of p.X.mean.j") +geom_vline(xintercept =mean(data_binx_conty$p.X.mean.j), linetype ="dashed", color ="red")p2.23<-ggplot(data_binx_conty_level2, aes(x = eta)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of eta") +geom_vline(xintercept =mean(data_binx_conty$eta), linetype ="dashed", color ="red")# eta is nicely distributed with -3 to 3 range# when sdX.between is large (e.g, sqrt(4)) the distribution of probabilities # of X and Y are U/bowl shaped (higher density at the edges)# when sdX.between is small (e.g., sqrt(0.4)) the distribution of probabilities# of X and Y are inverted U shaped (higher density in the center)# combine plotsgridExtra::grid.arrange(p2.11, p2.12, p2.21, p2.22, p2.23, ncol =2)
Code
# 2.3 scatterplot of p.X.mean.j and b0, b0 and eta, estimated cluster mean and p.X.mean.jp2.31<-ggplot(data_binx_conty_level2, aes(x = p.X.mean.j, y = b0)) +geom_point() +labs(title ="Scatterplot of p.X.mean.j and b0") +stat_cor(method ="pearson", digits =3)p2.32<-ggplot(data_binx_conty_level2, aes(x = b0, y = eta)) +geom_point() +labs(title ="Scatterplot of b0 and eta") +stat_cor(method ="pearson", digits =3)p2.33<-ggplot(data_binx_conty_level2, aes(x = p.X.mean.j, y = X.cluster.means)) +geom_point() +labs(title ="Scatterplot of X.cluster.means and p.X.mean.j", x ="True underlying probability", y ="Observed proportion") +stat_cor(method ="pearson", digits =3)# combine plotsgridExtra::grid.arrange(p2.31, p2.32, p2.33, ncol =2)
Code
# create overlapping density plots for X.cluster.means and p.X.mean.jp2.41<-ggplot(data_binx_conty_level2, aes(x = X.cluster.means, fill ="X.cluster.means")) +geom_histogram(alpha =0.8, binwidth =0.02) +geom_histogram(aes(x = p.X.mean.j, fill ="p.X.mean.j"), alpha =0.8, binwidth =0.02) +labs(title ="Density plot of X.cluster.means and p.X.mean.j", x ="value")# compute difference between estimated and true cluster mean for each clusterdata_binx_conty_level2$diff <- data_binx_conty_level2$X.cluster.means - data_binx_conty_level2$p.X.mean.j# plot histogram of differencesp2.42<-ggplot(data_binx_conty_level2, aes(x = diff)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of differences between estimated and true cluster mean") +geom_vline(xintercept =mean(data_binx_conty_level2$diff, na.rm =TRUE), linetype ="dashed", color ="red")# combine plotsgridExtra::grid.arrange(p2.41, p2.42, nrow =2)
Cluster Time X.mean.j b0
Min. : 1.0 Min. : 1.00 Min. :-9.07807 Min. :-28.1225
1st Qu.: 250.8 1st Qu.: 5.75 1st Qu.:-1.98921 1st Qu.: -6.2389
Median : 500.5 Median :10.50 Median : 0.03053 Median : 0.1808
Mean : 500.5 Mean :10.50 Mean : 0.08772 Mean : 0.2572
3rd Qu.: 750.2 3rd Qu.:15.25 3rd Qu.: 2.19022 3rd Qu.: 6.7083
Max. :1000.0 Max. :20.00 Max. :10.76135 Max. : 32.4569
b1 p.X.mean.j X eta
Min. :1.5 Mode:logical Min. :-10.61623 Min. :-44.0469
1st Qu.:1.5 NA's:20000 1st Qu.: -2.04541 1st Qu.: -8.9802
Median :1.5 Median : 0.05029 Median : 0.1885
Mean :1.5 Mean : 0.11174 Mean : 0.4248
3rd Qu.:1.5 3rd Qu.: 2.27474 3rd Qu.: 9.9190
Max. :1.5 Max. : 12.49205 Max. : 50.7784
Y p.Y X.cluster.means X.cent
Min. :0.000 Min. :0.0000000 Min. :-9.3529 Min. :-3.568000
1st Qu.:0.000 1st Qu.:0.0001259 1st Qu.:-1.9292 1st Qu.:-0.658154
Median :1.000 Median :0.5469822 Median : 0.0441 Median :-0.002651
Mean :0.505 Mean :0.5055416 Mean : 0.1117 Mean : 0.000000
3rd Qu.:1.000 3rd Qu.:0.9999508 3rd Qu.: 2.1982 3rd Qu.: 0.648543
Max. :1.000 Max. :1.0000000 Max. :10.7529 Max. : 3.689867
Code
fixef(glmer(Y ~ X.cent + X.cluster.means + (1| Cluster), data = data_contx_biny, family =binomial(link ="logit")))
# 3.1 check distributions of X and Yp3.11<-ggplot(data_contx_biny, aes(x = Y)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of Y") +geom_vline(xintercept =mean(data_contx_biny$Y), linetype ="dashed", color ="red")p3.12<-ggplot(data_contx_biny, aes(x = X)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of X") +geom_vline(xintercept =mean(data_contx_biny$X), linetype ="dashed", color ="red")# only keep level 2 variablesdata_contx_biny_level2 <- data_contx_biny %>%select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%distinct(Cluster, .keep_all =TRUE)# 3.2 check distributions of eta and p_yp3.21<-ggplot(data_contx_biny_level2, aes(x = eta)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of eta") +geom_vline(xintercept =mean(data_contx_biny$eta), linetype ="dashed", color ="red")p3.22<-ggplot(data_contx_biny, aes(x = p.Y)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of p_y") +geom_vline(xintercept =mean(data_contx_biny$p.Y), linetype ="dashed", color ="red")# combine plotsgridExtra::grid.arrange(p3.11, p3.12, p3.21, p3.22, ncol =2)
Code
# 3.3 scatterplot of b0 and eta and X.mean.j and X.cluster.meansp3.31<-ggplot(data_contx_biny, aes(x = b0, y = eta)) +geom_point() +labs(title ="Scatterplot of b0 and eta") +stat_cor(method ="pearson", digits =3)p3.32<-ggplot(data_contx_biny, aes(x = X.mean.j, y = X.cluster.means)) +geom_point() +labs(title ="Scatterplot of X.mean.j and X.cluster.means", x ="True cluster mean", y ="Estimated cluster mean") +stat_cor(method ="pearson", digits =3)# combine plotsgridExtra::grid.arrange(p3.31, p3.32, ncol =2)
Code
# create overlapping density plots for X.cluster.means and p.X.mean.jp3.41<-ggplot(data_contx_biny_level2, aes(x = X.cluster.means, fill ="X.cluster.means")) +geom_density(alpha =0.5) +geom_density(aes(x = X.mean.j, fill ="p.X.mean.j"), alpha =0.5) +labs(title ="Density plot of X.cluster.means and X.mean.j", x ="value")# compute difference between estimated and true cluster mean for each clusterdata_contx_biny_level2$diff <- data_contx_biny_level2$X.cluster.means - data_contx_biny_level2$X.mean.j# plot histogram of differencesp3.42<-ggplot(data_contx_biny_level2, aes(x = diff)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of differences between estimated and true cluster mean") +geom_vline(xintercept =mean(data_contx_biny_level2$diff), linetype ="dashed", color ="red")# combine plotsgridExtra::grid.arrange(p3.41, p3.42, nrow =2)
Cluster Time X.mean.j b0
Min. : 1.0 Min. : 1.00 Min. :-9.08314 Min. :-2.3044
1st Qu.: 250.8 1st Qu.: 5.75 1st Qu.:-2.03175 1st Qu.: 0.4382
Median : 500.5 Median :10.50 Median :-0.01095 Median : 1.4624
Mean : 500.5 Mean :10.50 Mean :-0.01244 Mean : 1.4754
3rd Qu.: 750.2 3rd Qu.:15.25 3rd Qu.: 1.98748 3rd Qu.: 2.4943
Max. :1000.0 Max. :20.00 Max. : 9.51789 Max. : 5.3999
b1 p.X.mean.j X eta
Min. :1.5 Min. :0.0001136 Min. :0.0000 Min. :-2.304
1st Qu.:1.5 1st Qu.:0.1159099 1st Qu.:0.0000 1st Qu.: 0.629
Median :1.5 Median :0.4972631 Median :0.0000 Median : 2.128
Mean :1.5 Mean :0.4943383 Mean :0.4939 Mean : 2.216
3rd Qu.:1.5 3rd Qu.:0.8794748 3rd Qu.:1.0000 3rd Qu.: 3.807
Max. :1.5 Max. :0.9999265 Max. :1.0000 Max. : 6.900
Y p.Y X.cluster.means X.cent
Min. :0.000 Min. :0.09076 Min. :0.0000 Min. :-0.95
1st Qu.:1.000 1st Qu.:0.65226 1st Qu.:0.1000 1st Qu.:-0.10
Median :1.000 Median :0.89358 Median :0.5000 Median : 0.00
Mean :0.792 Mean :0.79295 Mean :0.4939 Mean : 0.00
3rd Qu.:1.000 3rd Qu.:0.97828 3rd Qu.:0.9000 3rd Qu.: 0.10
Max. :1.000 Max. :0.99899 Max. :1.0000 Max. : 0.95
Code
fixef(glmer(Y ~ X.cent + X.cluster.means + (1| Cluster), data = data_binary, family =binomial(link ="logit")))
# 4.1 check distributions of X and Yp4.11<-ggplot(data_binary, aes(x = Y)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of Y") +geom_vline(xintercept =mean(data_binary$Y), linetype ="dashed", color ="red")p4.12<-ggplot(data_binary, aes(x = X)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of X") +geom_vline(xintercept =mean(data_binary$X), linetype ="dashed", color ="red")# only keep level 2 variablesdata_binary_level2 <- data_binary %>%select(Cluster, X.cluster.means, X.mean.j, p.X.mean.j, eta, b0, b1) %>%distinct(Cluster, .keep_all =TRUE)# 4.2 histograms with density overlay for evaluation of probability distributionsp4.21<-ggplot(data_binary_level2, aes(x = eta)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of eta") +geom_vline(xintercept =mean(data_binary$eta), linetype ="dashed", color ="red")p4.22<-ggplot(data_binary_level2, aes(x = p.X.mean.j)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of p.X.mean.j") +geom_vline(xintercept =mean(data_binary$p.X.mean.j), linetype ="dashed", color ="red")p4.23<-ggplot(data_binary, aes(x = p.Y)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of p_y") +geom_vline(xintercept =mean(data_binary$p.Y), linetype ="dashed", color ="red")# when sdX.between is large (e.g, sqrt(4)) the distribution of probabilities # of X and Y are U/bowl shaped (higher density at the edges)# when sdX.between is small (e.g., sqrt(0.4)) the distribution of probabilities # of X and Y are inverted U shaped (higher density in the center)# when sdX.between is moderate (e.g., sqrt(2)) the distribution of probabilities# is more uniform but inverted U shaped for X and slightly U shaped for Y.# combine plotsgridExtra::grid.arrange(p4.11, p4.12, p4.21, p4.22, p4.23, ncol =2)
Code
# 4.3 scatterplots for evaluation of relationship between probabilities and originating variables# between p.X.mean.j and b0p4.31<-ggplot(data_binary_level2, aes(x = p.X.mean.j, y = b0)) +geom_point() +labs(title ="Scatterplot of p.X.mean.j and b0") +stat_cor(method ="pearson", digits =3)# between b0 and etap4.32<-ggplot(data_binary_level2, aes(x = b0, y = eta)) +geom_point() +labs(title ="Scatterplot of b0 and eta") +stat_cor(method ="pearson", digits =3)# between X.cluster.means and p.X.mean.jp4.33<-ggplot(data_binary_level2, aes(x = p.X.mean.j, y = X.cluster.means)) +geom_point() +labs(title ="Scatterplot of X.cluster.means and p.X.mean.j", x ="True underlying probability", y ="Observed proportion") +stat_cor(method ="pearson", digits =3)# combine plotsgridExtra::grid.arrange(p4.31, p4.32, p4.33, ncol =2)
Code
# Other: plots below only really indicate logit relationship# ggplot(data_binary, aes(x = X.mean.j, y = p.X.mean.j)) + geom_point() + labs(title = "Scatterplot of X.mean.j and p.X.mean.j")# As expected, we see a a sigmoid curve representing the logit relationship between X.mean.j # and p.X.mean.j. It is more pronounced (different from linear) when sdX.between is large.# ggplot(data_binary, aes(x = eta, y = p.Y)) + geom_point() + labs(title = "Scatterplot of eta and p_y")# As expected again, we see a sigmoid curve representing the logit relationship between eta# and p_y. It is more pronounced (different from linear) when sdX.between and sd.u0 are large.# create overlapping density plots for X.cluster.means and p.X.mean.jp4.41<-ggplot(data_binary_level2, aes(x = X.cluster.means, fill ="X.cluster.means")) +geom_histogram(alpha =0.8, binwidth =0.02) +geom_histogram(aes(x = p.X.mean.j, fill ="p.X.mean.j"), alpha =0.8) +labs(title ="Density plot of X.cluster.means and p.X.mean.j", x ="value")# compute difference between estimated and true cluster mean for each clusterdata_binary_level2$diff <- data_binary_level2$X.cluster.means - data_binary_level2$p.X.mean.j# plot histogram of differencesp4.42<-ggplot(data_binary_level2, aes(x = diff)) +geom_histogram(aes(y = ..density..), bins =30, fill ="lightblue", color ="black") +geom_density(alpha =0.5, fill ="blue") +labs(title ="Histogram of differences between estimated and true cluster mean") +geom_vline(xintercept =mean(data_binary_level2$diff), linetype ="dashed", color ="red")# combine plotsgridExtra::grid.arrange(p4.41, p4.42, nrow =2)