Using Data Simulation in the Classroom

https://debruine.github.io/talks/Leuven-2025/

Lisa DeBruine

Abstract

Being able to simulate data allows you to prep analysis scripts for pre-registration, calculate power and sensitivity for analyses that don’t have empirical methods, create reproducible examples when your data are too big or confidential to share, enhance your understanding of statistical concepts, and create demo data for teaching and tutorials. I will introduce the R package {faux} for data simulation from descriptive statistics. I will also give several examples of how to use data simulation in teaching statistical concepts.

psyteachr.github.io

Why Simulate Data?

Reproducible Examples

Create reproducible examples when your data are too big or confidential to share

Talk: Replicability of results in the context of private non-sharable data

Pre-Registration

Prep analysis scripts for pre-registration or registered reports

Power

Calculate power and sensitivity for analyses that don’t have empirical methods

Enhance Understanding

Enhance your understanding of statistical or other complex concepts

Teaching Data

Create demo data for teaching and tutorials

Faux

Shiny app or scriptable

Extensive Documentation

]

Faux Code

sim_data <- faux::sim_design(
  within = list( version   = c(V1 = "Version 1",    V2 = "Version 2"), 
                 condition = c(ctl = "Control",     exp = "Experimental")),
  between = list(age_group = c(young = "Age 20-29", old = "Age 70-79")),
  n = 30,
  mu = c(100, 100, 100, 100, 100, 90, 110, 110),
  sd = 20,
  r = 0.5,
  dv = c(score = "Score"),
  id = c(id = "Subject ID"),
  vardesc = list(version   = "Task Version", 
                 condition = "Experiment Condition", 
                 age_group = "Age Group"),
  long = TRUE
)

Faux Design Parameters

age_group	version	condition	V1_ctl	V1_exp	V2_ctl	V2_exp	n	mu	sd
young	V1	ctl	1.0	0.5	0.5	0.5	30	100	20
young	V1	exp	0.5	1.0	0.5	0.5	30	100	20
young	V2	ctl	0.5	0.5	1.0	0.5	30	100	20
young	V2	exp	0.5	0.5	0.5	1.0	30	100	20
old	V1	ctl	1.0	0.5	0.5	0.5	30	100	20
old	V1	exp	0.5	1.0	0.5	0.5	30	90	20
old	V2	ctl	0.5	0.5	1.0	0.5	30	110	20
old	V2	exp	0.5	0.5	0.5	1.0	30	110	20

Faux Design Plot

sim_data |> get_design() |> plot()

Faux Data Plot

sim_data |> plot(geoms = c("violin", "pointrangeSE"))

Power Simulation: Replicate Data

sim_data <- faux::sim_design(
  within = list(version = c(V1 = "Version 1", V2 = "Version 2"), 
                condition = c(ctl = "Control", exp = "Experimental")),
  between = list(age_group = c(young = "Age 20-29", old = "Age 70-79")),
  n = 30,
  mu = c(100, 100, 100, 100, 100, 90, 110, 110),
  sd = 20,
  r = 0.5,
  dv = c(score = "Score"),
  id = c(id = "Subject ID"),
  vardesc = list(version = "Task Version", 
                 condition = "Experiment Condition", 
                 age_group = "Age Group"),
  long = TRUE,
  rep = 100
)

Power Simulation: Analysis Function

# setup options to avoid annoying afex message & run faster
afex::set_sum_contrasts()
afex::afex_options(include_aov = FALSE) 

analysis_func <- function(data) {
  a <- afex::aov_ez(
    id = "id", 
    dv = "score", 
    between = "age_group",
    within = c("version", "condition"),
    data = data)
  
  as_tibble(a$anova_table, rownames = "term") |>
    rename(p = `Pr(>F)`)
}

Power Simulation: Analysis Result

# test on first data set
analysis_func(sim_data$data[[1]])

term	num Df	den Df	MSE	F	ges	p
age_group	1	58	1,184.9	0.07	0.001	0.795
version	1	58	206.3	18.17	0.035	0.000
age_group:version	1	58	206.3	21.46	0.042	0.000
condition	1	58	169.6	2.05	0.003	0.158
age_group:condition	1	58	169.6	0.22	0.000	0.640
version:condition	1	58	197.3	4.60	0.009	0.036
age_group:version:condition	1	58	197.3	0.40	0.001	0.529

Power Simulation

power <- sim_data |>
  mutate(analysis = purrr::map(data, analysis_func)) |>
  select(-data) |>
  unnest(analysis) |>
  group_by(term) |>
  summarise(power = mean(p < .05))

term	power
age_group	0.10
age_group:condition	0.24
age_group:version	0.99
age_group:version:condition	0.24
condition	0.24
version	1.00
version:condition	0.30

Mixed Effects

Random factors

subj_n <- 2 # number of subjects

lmem_dat <- add_random(subj = subj_n)

subj
subj1
subj2

Crossed random factors

item_n <- 2 # number of items

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n)

subj	item
subj1	item1
subj1	item2
subj2	item1
subj2	item2

Nested random factors

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n, .nested_in = "subj")

subj	item
subj1	item1
subj1	item2
subj2	item3
subj2	item4

Fixed factors (within)

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental"))

subj	item	condition
subj1	item1	control
subj1	item1	experimental
subj1	item2	control
subj1	item2	experimental
subj2	item1	control
subj2	item1	experimental
subj2	item2	control
subj2	item2	experimental

Fixed factors (between)

gender_prob <- c(20, 75, 5) # gender category distribution

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob)

subj	item	condition	gender
subj1	item1	control	M
subj1	item1	experimental	M
subj1	item2	control	M
subj1	item2	experimental	M
subj2	item1	control	M
subj2	item1	experimental	M
subj2	item2	control	M
subj2	item2	experimental	M

Contrast Coding

My name	Other names	R function	faux function
Treatment	Treatment (2), Dummy (1, 4, 6), Simple (5)	`contr.treatment`	`contr_code_treatment`
Anova	Deviation (2), Contrast (1), Simple (4)	`contr.treatment - 1/k`	`contr_code_anova`
Sum	Sum (1, 2, 6), Effects (3), Deviation (4, 5), Unweighted Effects (7)	`contr.sum`	`contr_code_sum`
Difference	Contrast (3), Forward/Backward (4), Repeated (5)	`MASS::contr.sdif`	`contr_code_difference`
Helmert	Reverse Helmert (1, 4), Difference (5), Contrast (7)	`contr.helmert / (column_i+1)`	`contr_code_helmert`
Polynomial	Polynomial (5), Orthogonal Polynomial (4), Trend (3)	`contr.poly`	`contr_code_poly`

Explainer

Codings

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "treat") |>
  add_contrast("condition", "anova", colnames = "anova") |>
  add_contrast("condition", "sum", colnames = "sum")

subj	item	condition	gender	treat	anova	sum
subj1	item1	control	M	0	-0.5	1
subj1	item1	experimental	M	1	0.5	-1
subj1	item2	control	M	0	-0.5	1
subj1	item2	experimental	M	1	0.5	-1
subj2	item1	control	F	0	-0.5	1
subj2	item1	experimental	F	1	0.5	-1
subj2	item2	control	F	0	-0.5	1
subj2	item2	experimental	F	1	0.5	-1

Design

# define this to make room later

design <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "cond")

subj	item	condition	gender	cond
subj1	item1	control	F	0
subj1	item1	experimental	F	1
subj1	item2	control	F	0
subj1	item2	experimental	F	1
subj2	item1	control	M	0
subj2	item1	experimental	M	1
subj2	item2	control	M	0
subj2	item2	experimental	M	1

Fixed Effects

intercept <- 100 # model intercept (mean for control condition)
cond_eff  <-  10 # condition effect size

lmem_dat <- design |>
  mutate(dv = intercept + 
           cond * cond_eff)

subj	item	condition	gender	cond	dv
subj1	item1	control	F	0	100
subj1	item1	experimental	F	1	110
subj1	item2	control	F	0	100
subj1	item2	experimental	F	1	110
subj2	item1	control	M	0	100
subj2	item1	experimental	M	1	110
subj2	item2	control	M	0	100
subj2	item2	experimental	M	1	110

Error Term

error_sd <- 20 # SD of trial-level error (residuals)

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  mutate(dv = intercept + err +
           cond * cond_eff)

subj	item	condition	gender	cond	err	dv
subj1	item1	control	F	0	-8.08	91.92
subj1	item1	experimental	F	1	-23.96	86.04
subj1	item2	control	F	0	0.23	100.23
subj1	item2	experimental	F	1	5.42	115.42
subj2	item1	control	M	0	1.00	101.00
subj2	item1	experimental	M	1	16.41	126.41
subj2	item2	control	M	0	-20.19	79.81
subj2	item2	experimental	M	1	-6.39	103.61

Random Intercepts

subj_sd      <-   8 # SD of subject-level intercepts
item_sd      <-   4 # SD of item-level intercepts

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", subj_i = subj_sd) |>
  add_ranef(.by = "item", item_i = item_sd) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * cond_eff)

subj	item	condition	gender	cond	err	subj_i	item_i	dv
subj1	item1	control	F	0	13.58	6.35	2.42	122.35
subj1	item1	experimental	F	1	5.94	6.35	2.42	124.70
subj1	item2	control	F	0	15.91	6.35	0.27	122.53
subj1	item2	experimental	F	1	16.86	6.35	0.27	133.48
subj2	item1	control	M	0	4.08	-12.49	2.42	94.01
subj2	item1	experimental	M	1	-16.53	-12.49	2.42	83.40
subj2	item2	control	M	0	-4.60	-12.49	0.27	83.19
subj2	item2	experimental	M	1	9.79	-12.49	0.27	107.57

Random Slopes

subj_cond_sd <-   5 # SD of subject-level condition effect size
item_cond_sd <-  15 # SD of item-level condition effect size
subj_cors    <-   0.5 # correlation between subject intercept and slope
item_cors    <-  -0.5 # correlation between item intercept and slope

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", 
            subj_i = subj_sd, 
            subj_cond = subj_cond_sd, 
            .cors = subj_cors) |>
  add_ranef(.by = "item", 
            item_i = item_sd, 
            item_cond = item_cond_sd, 
            .cors = item_cors) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * (cond_eff + subj_cond + item_cond))

Random Slopes

subj	item	condition	gender	cond	err	subj_i	subj_cond	item_i	item_cond	dv
subj1	item1	control	F	0	-8.13	3.30	5.51	-2.06	3.12	93.11
subj1	item1	experimental	F	1	27.74	3.30	5.51	-2.06	3.12	147.61
subj1	item2	control	F	0	30.25	3.30	5.51	-1.17	5.71	132.39
subj1	item2	experimental	F	1	-2.88	3.30	5.51	-1.17	5.71	120.48
subj2	item1	control	M	0	-57.82	7.53	-0.82	-2.06	3.12	47.65
subj2	item1	experimental	M	1	-31.04	7.53	-0.82	-2.06	3.12	86.73
subj2	item2	control	M	0	0.45	7.53	-0.82	-1.17	5.71	106.81
subj2	item2	experimental	M	1	-22.36	7.53	-0.82	-1.17	5.71	98.89

Parameters

subj_n       <- 100   # number of subjects
item_n       <-  20   # number of items
gender_prob  <- c(20, 75, 5) # gender category distribution
intercept    <- 100   # model intercept (mean for control condition)
cond_eff     <-  10   # condition effect size
error_sd     <-  20   # SD of trial-level error (residuals)
subj_sd      <-   8   # SD of subject-level intercepts
item_sd      <-   4   # SD of item-level intercepts
subj_cond_sd <-   5   # SD of subject-level condition effect size
item_cond_sd <-  15   # SD of item-level condition effect size
subj_cors    <-   0.5 # correlation between subject intercept and slope
item_cors    <-  -0.5 # correlation between item intercept and slope

Full Data Simulation Code

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "cond") |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", 
            subj_i = subj_sd, 
            subj_cond = subj_cond_sd, 
            .cors = subj_cors) |>
  add_ranef(.by = "item", 
            item_i = item_sd, 
            item_cond = item_cond_sd, 
            .cors = item_cors) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * (cond_eff + subj_cond + item_cond))

LMEM Analysis

lmer(dv ~ cond + 
       (1 + cond | subj) + 
       (1 + cond | item),
     data = lmem_dat) |> summary()

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: dv ~ cond + (1 + cond | subj) + (1 + cond | item)
   Data: lmem_dat

REML criterion at convergence: 35792.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4674 -0.6511 -0.0066  0.6492  3.6816 

Random effects:
 Groups   Name        Variance Std.Dev. Corr 
 subj     (Intercept)  48.191   6.942        
          cond         24.472   4.947   0.46 
 item     (Intercept)   7.951   2.820        
          cond        145.204  12.050   -0.07
 Residual             415.973  20.395        
Number of obs: 4000, groups:  subj, 100; item, 20

Fixed effects:
            Estimate Std. Error      df t value Pr(>|t|)    
(Intercept)  101.474      1.043  49.032  97.308  < 2e-16 ***
cond           9.509      2.814  20.202   3.379  0.00295 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
     (Intr)
cond -0.056

Power simulation

Wrap dataset creation and analysis in a function that returns the values you care about as a data frame
Iterate this function and save the output as a data frame
Summarise the output (e.g., power, range of effect sizes)

Function Outline

sim_func <- function(iteration = 0, subj_n = 100, item_n = 20, cond_eff = 0) {
  # variables not in function arguments

  # simulate the data
  lmem_dat <- ...
  
  # run the analysis
  mod <- lmer(...)
  
  # return a table of fixed effects only
  broom.mixed::tidy(mod, effects = "fixed") |>
    mutate(iteration = iteration)
}

Full Function

sim_func <- function(iteration = 0, subj_n = 100, item_n = 20, cond_eff = 0) {
  # variables not in function arguments
  gender_prob  <- c(20, 75, 5)
  intercept    <- 100
  error_sd     <-  20
  subj_sd      <-   8
  item_sd      <-   4
  subj_cond_sd <-   5
  item_cond_sd <-  15
  subj_cors    <-   0.5
  item_cors    <-  -0.5

# simulate the data
  lmem_dat <- add_random(subj = subj_n) |>
    add_random(item = item_n) |>
    add_within(condition = c("control", "experimental")) |>
    add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
    add_contrast("condition", "treatment", colnames = "cond") |>
    add_ranef(err = error_sd) |>
    add_ranef(.by = "subj", 
              subj_i = subj_sd, 
              subj_cond = subj_cond_sd, 
              .cors = subj_cors) |>
    add_ranef(.by = "item", 
              item_i = item_sd, 
              item_cond = item_cond_sd, 
              .cors = item_cors) |>
    mutate(dv = intercept + subj_i + item_i + err +
             cond * (cond_eff + subj_cond + item_cond))
  
  # run the analysis
  mod <- lmer(dv ~ cond + 
       (1 + cond | subj) + 
       (1 + cond | item),
     data = lmem_dat)
  
  # return a table of fixed effects only
  broom.mixed::tidy(mod, effects = "fixed") |>
    mutate(iteration = iteration)
}

Test the Function

sim_func()

effect	term	estimate	std.error	statistic	df	p.value
fixed	(Intercept)	102.73	1.24	83.15	60.34	0.00
fixed	cond	-5.02	3.51	-1.43	19.51	0.17

# again to make sure iterations differ
sim_func()

effect	term	estimate	std.error	statistic	df	p.value
fixed	(Intercept)	100.28	1.25	80.14	53.40	0.00
fixed	cond	-1.78	4.10	-0.43	19.33	0.67

# change effect size
sim_func(cond_eff = 10)

effect	term	estimate	std.error	statistic	df	p.value
fixed	(Intercept)	100.09	1.22	81.90	61.96	0.00
fixed	cond	8.20	2.75	2.98	20.59	0.01

Iterate

set.seed(8675309)

sim_results <- map_df(1:10, sim_func, cond_eff = 10)

effect	term	estimate	std.error	statistic	df	p.value	iteration
fixed	(Intercept)	98.97	0.99	99.85	67.08	0.00	1
fixed	cond	11.69	3.18	3.67	20.22	0.00	1
fixed	(Intercept)	102.87	1.09	94.34	57.20	0.00	2
fixed	cond	7.64	2.85	2.68	19.88	0.01	2
fixed	(Intercept)	100.08	1.43	70.04	33.77	0.00	3
fixed	cond	6.06	4.42	1.37	19.22	0.19	3

Summarise

sim_results |>
  summarise(power = mean(p.value < 0.05), .by = term)

term	power
(Intercept)	1.0
cond	0.7

Further Resources

Thank You!

https://debruine.github.io/talks/Leuven-2025/