Data simulation with {faux} for mixed designs

debruine.github.io/talks/lmemsim-2024

Lisa DeBruine

Abstract

Being able to simulate data allows you to prep analysis scripts for pre-registration, calculate power and sensitivity for analyses that don’t have empirical methods, create reproducible examples when your data are too big or confidential to share, enhance your understanding of statistical concepts, and create demo data for teaching and tutorials. In this talk, I will introduce the basics of simulation using the R package {faux}. We will focus on simulating data from a mixed design where trials are crossed with subjects, analysing this using {lme4}, understanding how the simulation parameters correspond to the output, and using simulation to calculate power.

Why Simulate Data?

Pre-Registration

Prep analysis scripts for pre-registration or registered reports

Power

Calculate power and sensitivity for analyses that don’t have empirical methods

Reproducible Examples

Create reproducible examples when your data are too big or confidential to share

Enhance Understanding

Enhance your understanding of statistical or other complex concepts

Teaching Data

Create demo data for teaching and tutorials

Faux

Shiny app or R package

Extensive Documentation

]

Faux Code

sim_data <- faux::sim_design(
  between = list(age_group = c(young = "Age 20-29", old = "Age 70-79")),
  within = list(version = c(V1 = "Version 1", V2 = "Version 2"), 
                condition = c(ctl = "Control", exp = "Experimental")),
  n = 30,
  mu = data.frame(
    young = c(100, 100, 100, 100), 
    old = c(100, 90, 110, 110),
    row.names = c("V1_ctl", "V1_exp", "V2_ctl", "V2_exp")
  ),
  sd = 20,
  r = 0.5,
  dv = c(score = "Score"),
  id = c(id = "Subject ID"),
  vardesc = list(version = "Task Version", 
                 condition = "Experiment Condition", 
                 age_group = "Age Group"),
  long = TRUE
)

Faux Design Parameters

get_design(sim_data)

age_group	version	condition	V1_ctl	V1_exp	V2_ctl	V2_exp	n	mu	sd
young	V1	ctl	1.0	0.5	0.5	0.5	30	100	20
young	V1	exp	0.5	1.0	0.5	0.5	30	100	20
young	V2	ctl	0.5	0.5	1.0	0.5	30	100	20
young	V2	exp	0.5	0.5	0.5	1.0	30	100	20
old	V1	ctl	1.0	0.5	0.5	0.5	30	100	20
old	V1	exp	0.5	1.0	0.5	0.5	30	90	20
old	V2	ctl	0.5	0.5	1.0	0.5	30	110	20
old	V2	exp	0.5	0.5	0.5	1.0	30	110	20

Faux Design Plot

sim_data |> get_design() |> plot()

Faux Data Plot

sim_data |> plot(geoms = c("violin", "pointrangeSE"))

Mixed Effects

Random factors

subj_n <- 2 # number of subjects

lmem_dat <- add_random(subj = subj_n)

subj
subj1
subj2

Crossed random factors

item_n <- 2 # number of items

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n)

subj	item
subj1	item1
subj1	item2
subj2	item1
subj2	item2

Nested random factors

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n, .nested_in = "subj")

subj	item
subj1	item1
subj1	item2
subj2	item3
subj2	item4

Fixed factors (within)

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental"))

subj	item	condition
subj1	item1	control
subj1	item1	experimental
subj1	item2	control
subj1	item2	experimental
subj2	item1	control
subj2	item1	experimental
subj2	item2	control
subj2	item2	experimental

Fixed factors (between)

gender_prob <- c(20, 75, 5) # gender category distribution

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob)

subj	item	condition	gender
subj1	item1	control	M
subj1	item1	experimental	M
subj1	item2	control	M
subj1	item2	experimental	M
subj2	item1	control	M
subj2	item1	experimental	M
subj2	item2	control	M
subj2	item2	experimental	M

Contrast Coding

My name	Other names	R function	faux function
Treatment	Treatment (2), Dummy (1, 4, 6), Simple (5)	`contr.treatment`	`contr_code_treatment`
Anova	Deviation (2), Contrast (1), Simple (4)	`contr.treatment - 1/k`	`contr_code_anova`
Sum	Sum (1, 2, 6), Effects (3), Deviation (4, 5), Unweighted Effects (7)	`contr.sum`	`contr_code_sum`
Difference	Contrast (3), Forward/Backward (4), Repeated (5)	`MASS::contr.sdif`	`contr_code_difference`
Helmert	Reverse Helmert (1, 4), Difference (5), Contrast (7)	`contr.helmert / (column_i+1)`	`contr_code_helmert`
Polynomial	Polynomial (5), Orthogonal Polynomial (4), Trend (3)	`contr.poly`	`contr_code_poly`

Explainer

Codings

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "treat") |>
  add_contrast("condition", "anova", colnames = "anova") |>
  add_contrast("condition", "sum", colnames = "sum")

subj	item	condition	gender	treat	anova	sum
subj1	item1	control	F	0	-0.5	1
subj1	item1	experimental	F	1	0.5	-1
subj1	item2	control	F	0	-0.5	1
subj1	item2	experimental	F	1	0.5	-1
subj2	item1	control	F	0	-0.5	1
subj2	item1	experimental	F	1	0.5	-1
subj2	item2	control	F	0	-0.5	1
subj2	item2	experimental	F	1	0.5	-1

Design

# define this to make room later

design <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "cond")

subj	item	condition	gender	cond
subj1	item1	control	F	0
subj1	item1	experimental	F	1
subj1	item2	control	F	0
subj1	item2	experimental	F	1
subj2	item1	control	F	0
subj2	item1	experimental	F	1
subj2	item2	control	F	0
subj2	item2	experimental	F	1

Fixed Effects

intercept <- 100 # model intercept (mean for control condition)
cond_eff  <-  10 # condition effect size

lmem_dat <- design |>
  mutate(dv = intercept + 
           cond * cond_eff)

subj	item	condition	gender	cond	dv
subj1	item1	control	F	0	100
subj1	item1	experimental	F	1	110
subj1	item2	control	F	0	100
subj1	item2	experimental	F	1	110
subj2	item1	control	F	0	100
subj2	item1	experimental	F	1	110
subj2	item2	control	F	0	100
subj2	item2	experimental	F	1	110

Error Term

error_sd <- 20 # SD of trial-level error (residuals)

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  mutate(dv = intercept + err +
           cond * cond_eff)

subj	item	condition	gender	cond	err	dv
subj1	item1	control	F	0.00	−11.09	88.91
subj1	item1	experimental	F	1.00	−30.34	79.66
subj1	item2	control	F	0.00	−16.56	83.44
subj1	item2	experimental	F	1.00	14.88	124.88
subj2	item1	control	F	0.00	−20.52	79.48
subj2	item1	experimental	F	1.00	19.75	129.75
subj2	item2	control	F	0.00	−17.96	82.04
subj2	item2	experimental	F	1.00	−38.54	71.46

Random Intercepts

subj_sd      <-   8 # SD of subject-level intercepts
item_sd      <-   4 # SD of item-level intercepts

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", subj_i = subj_sd) |>
  add_ranef(.by = "item", item_i = item_sd) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * cond_eff)

subj	item	condition	gender	cond	err	subj_i	item_i	dv
subj1	item1	control	F	0.00	−44.04	−3.24	3.03	55.75
subj1	item1	experimental	F	1.00	−22.93	−3.24	3.03	86.86
subj1	item2	control	F	0.00	12.43	−3.24	2.02	111.20
subj1	item2	experimental	F	1.00	17.01	−3.24	2.02	125.78
subj2	item1	control	F	0.00	−14.50	−7.99	3.03	80.54
subj2	item1	experimental	F	1.00	27.81	−7.99	3.03	132.84
subj2	item2	control	F	0.00	−6.31	−7.99	2.02	87.71
subj2	item2	experimental	F	1.00	−37.69	−7.99	2.02	66.34

Random Slopes

subj_cond_sd <-   5 # SD of subject-level condition effect size
item_cond_sd <-  15 # SD of item-level condition effect size
subj_cors    <-   0.5 # correlation between subject intercept and slope
item_cors    <-  -0.5 # correlation between item intercept and slope

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", 
            subj_i = subj_sd, 
            subj_cond = subj_cond_sd, 
            .cors = subj_cors) |>
  add_ranef(.by = "item", 
            item_i = item_sd, 
            item_cond = item_cond_sd, 
            .cors = item_cors) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * (cond_eff + subj_cond + item_cond))

Random Slopes

subj	item	condition	gender	cond	err	subj_i	subj_cond	item_i	item_cond	dv
subj1	item1	control	F	0.00	21.78	−8.67	−2.06	2.66	−3.42	115.76
subj1	item1	experimental	F	1.00	−25.00	−8.67	−2.06	2.66	−3.42	73.50
subj1	item2	control	F	0.00	42.32	−8.67	−2.06	−4.72	20.12	128.93
subj1	item2	experimental	F	1.00	−3.23	−8.67	−2.06	−4.72	20.12	111.45
subj2	item1	control	F	0.00	26.91	−13.94	−2.69	2.66	−3.42	115.63
subj2	item1	experimental	F	1.00	−17.59	−13.94	−2.69	2.66	−3.42	75.02
subj2	item2	control	F	0.00	8.65	−13.94	−2.69	−4.72	20.12	89.99
subj2	item2	experimental	F	1.00	−36.14	−13.94	−2.69	−4.72	20.12	72.64

Parameters

subj_n       <- 100 # number of subjects
item_n       <-  20 # number of items
gender_prob  <- c(20, 75, 5) # gender category distribution
intercept    <- 100 # model intercept (mean for control condition)
cond_eff     <-  10 # condition effect size
error_sd     <-  20 # SD of trial-level error (residuals)
subj_sd      <-   8 # SD of subject-level intercepts
item_sd      <-   4 # SD of item-level intercepts
subj_cond_sd <-   5 # SD of subject-level condition effect size
item_cond_sd <-  15 # SD of item-level condition effect size
subj_cors    <-   0.5 # correlation between subject intercept and slope
item_cors    <-  -0.5 # correlation between item intercept and slope

Full Data Simulation Code

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "cond") |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", 
            subj_i = subj_sd, 
            subj_cond = subj_cond_sd, 
            .cors = subj_cors) |>
  add_ranef(.by = "item", 
            item_i = item_sd, 
            item_cond = item_cond_sd, 
            .cors = item_cors) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * (cond_eff + subj_cond + item_cond))

LM Analysis

# aggregate data over subjects
agg_dat <- lmem_dat |> 
  summarise(dv = mean(dv), .by = c(subj, cond))

lm(dv ~ cond, data = agg_dat) |> summary()


Call:
lm(formula = dv ~ cond, data = agg_dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-19.1018  -7.2699  -0.5588   6.9811  27.7265 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 100.2428     0.9473 105.825  < 2e-16 ***
cond         10.8725     1.3396   8.116 5.01e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.473 on 198 degrees of freedom
Multiple R-squared:  0.2496,    Adjusted R-squared:  0.2458 
F-statistic: 65.87 on 1 and 198 DF,  p-value: 5.014e-14

LMEM Analysis

lmer(dv ~ cond + 
       (1 + cond | subj) + 
       (1 + cond | item),
     data = lmem_dat) |> summary()

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: dv ~ cond + (1 + cond | subj) + (1 + cond | item)
   Data: lmem_dat

REML criterion at convergence: 35323.9

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.7821 -0.6676 -0.0080  0.6296  3.5944 

Random effects:
 Groups   Name        Variance Std.Dev. Corr 
 subj     (Intercept)  51.79    7.196        
          cond         13.58    3.685   0.48 
 item     (Intercept)  15.14    3.891        
          cond        225.11   15.004   -0.58
 Residual             368.84   19.205        
Number of obs: 4000, groups:  subj, 100; item, 20

Fixed effects:
            Estimate Std. Error      df t value Pr(>|t|)    
(Intercept)  100.243      1.208  41.244   82.98  < 2e-16 ***
cond          10.872      3.429  19.442    3.17  0.00493 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
     (Intr)
cond -0.426

Power simulation

Wrap dataset creation and analysis in a function that returns the values you care about as a data frame
Iterate this function and save the output as a data frame
Summarise the output (e.g., power, range of effect sizes)

Function Outline

sim_func <- function(iteration = 0, subj_n = 100, item_n = 20, cond_eff = 0) {
  # variables not in function arguments

  # simulate the data
  lmem_dat <- ...
  
  # run the analysis
  mod <- lmer(...)
  
  # return a table of fixed effects only
  broom.mixed::tidy(mod, effects = "fixed") |>
    mutate(iteration = iteration)
}

Full Function

sim_func <- function(iteration = 0, subj_n = 100, item_n = 20, cond_eff = 0) {
  # variables not in function arguments
  gender_prob  <- c(20, 75, 5)
  intercept    <- 100
  error_sd     <-  20
  subj_sd      <-   8
  item_sd      <-   4
  subj_cond_sd <-   5
  item_cond_sd <-  15
  subj_cors    <-   0.5
  item_cors    <-  -0.5

  # simulate the data
  lmem_dat <- add_random(subj = subj_n) |>
    add_random(item = item_n) |>
    add_within(condition = c("control", "experimental")) |>
    add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
    add_contrast("condition", "treatment", colnames = "cond") |>
    add_ranef(err = error_sd) |>
    add_ranef(.by = "subj", 
              subj_i = subj_sd, 
              subj_cond = subj_cond_sd, 
              .cors = subj_cors) |>
    add_ranef(.by = "item", 
              item_i = item_sd, 
              item_cond = item_cond_sd, 
              .cors = item_cors) |>
    
    mutate(dv = intercept + subj_i + item_i + err +
             cond * (cond_eff + subj_cond + item_cond))
  
  # run the analysis
  mod <- lmer(dv ~ cond + 
       (1 + cond | subj) + 
       (1 + cond | item),
     data = lmem_dat)
  
  # return a table of fixed effects only
  broom.mixed::tidy(mod, effects = "fixed") |>
    mutate(iteration = iteration)
}

Test the Function

sim_func()

effect	term	estimate	std.error	statistic	df	p.value	iteration
fixed	(Intercept)	100.37	1.18	85.03	42.60	0.00	0
fixed	cond	−1.54	2.63	−0.58	20.00	0.57	0

# again to make sure iterations differ
sim_func()

effect	term	estimate	std.error	statistic	df	p.value	iteration
fixed	(Intercept)	98.44	1.24	79.34	39.58	0.00	0
fixed	cond	9.16	2.80	3.27	19.78	0.00	0

# change effect size
sim_func(cond_eff = 10)

effect	term	estimate	std.error	statistic	df	p.value	iteration
fixed	(Intercept)	97.78	1.25	78.11	55.32	0.00	0
fixed	cond	9.59	2.84	3.38	20.56	0.00	0

Iterate

set.seed(8675309)

sim_results <- map_df(1:10, sim_func, cond_eff = 10)

effect	term	estimate	std.error	statistic	df	p.value	iteration
fixed	(Intercept)	98.97	0.99	99.85	67.08	0.00	1
fixed	cond	11.69	3.18	3.67	20.22	0.00	1
fixed	(Intercept)	102.87	1.09	94.33	57.18	0.00	2
fixed	cond	7.64	2.85	2.68	19.87	0.01	2
fixed	(Intercept)	100.08	1.43	70.04	33.77	0.00	3
fixed	cond	6.06	4.42	1.37	19.22	0.19	3

Summarise

sim_results |>
  summarise(power = mean(p.value < 0.05), .by = term)

term	power
(Intercept)	1.00
cond	0.70

Further Resources

Thank You!