Using Data Simulation in the Classroom

https://debruine.github.io/talks/Leuven-2025/

Lisa DeBruine

Abstract

Being able to simulate data allows you to prep analysis scripts for pre-registration, calculate power and sensitivity for analyses that don’t have empirical methods, create reproducible examples when your data are too big or confidential to share, enhance your understanding of statistical concepts, and create demo data for teaching and tutorials. I will introduce the R package {faux} for data simulation from descriptive statistics. I will also give several examples of how to use data simulation in teaching statistical concepts.

psyteachr.github.io

Why Simulate Data?

Reproducible Examples

Create reproducible examples when your data are too big or confidential to share

Talk: Replicability of results in the context of private non-sharable data

Pre-Registration

Prep analysis scripts for pre-registration or registered reports

Power

Calculate power and sensitivity for analyses that don’t have empirical methods

Enhance Understanding

Enhance your understanding of statistical or other complex concepts

Teaching Data

Create demo data for teaching and tutorials

Faux

Shiny App

Faux Code

sim_data <- faux::sim_design(
  within = list( version   = c(V1 = "Version 1",    V2 = "Version 2"), 
                 condition = c(ctl = "Control",     exp = "Experimental")),
  between = list(age_group = c(young = "Age 20-29", old = "Age 70-79")),
  n = 30,
  mu = c(100, 100, 100, 100, 100, 90, 110, 110),
  sd = 20,
  r = 0.5,
  dv = c(score = "Score"),
  id = c(id = "Subject ID"),
  vardesc = list(version   = "Task Version", 
                 condition = "Experiment Condition", 
                 age_group = "Age Group"),
  long = TRUE
)

Faux Design Parameters

age_group

version

condition

V1_ctl

V1_exp

V2_ctl

V2_exp

n

mu

sd

young

V1

ctl

1.0

0.5

0.5

0.5

30

100

20

young

V1

exp

0.5

1.0

0.5

0.5

30

100

20

young

V2

ctl

0.5

0.5

1.0

0.5

30

100

20

young

V2

exp

0.5

0.5

0.5

1.0

30

100

20

old

V1

ctl

1.0

0.5

0.5

0.5

30

100

20

old

V1

exp

0.5

1.0

0.5

0.5

30

90

20

old

V2

ctl

0.5

0.5

1.0

0.5

30

110

20

old

V2

exp

0.5

0.5

0.5

1.0

30

110

20

Faux Design Plot

sim_data |> get_design() |> plot()

Faux Data Plot

sim_data |> plot(geoms = c("violin", "pointrangeSE"))

Power Simulation: Replicate Data

sim_data <- faux::sim_design(
  within = list(version = c(V1 = "Version 1", V2 = "Version 2"), 
                condition = c(ctl = "Control", exp = "Experimental")),
  between = list(age_group = c(young = "Age 20-29", old = "Age 70-79")),
  n = 30,
  mu = c(100, 100, 100, 100, 100, 90, 110, 110),
  sd = 20,
  r = 0.5,
  dv = c(score = "Score"),
  id = c(id = "Subject ID"),
  vardesc = list(version = "Task Version", 
                 condition = "Experiment Condition", 
                 age_group = "Age Group"),
  long = TRUE,
  rep = 100
)

Power Simulation: Analysis Function

# setup options to avoid annoying afex message & run faster
afex::set_sum_contrasts()
afex::afex_options(include_aov = FALSE) 

analysis_func <- function(data) {
  a <- afex::aov_ez(
    id = "id", 
    dv = "score", 
    between = "age_group",
    within = c("version", "condition"),
    data = data)
  
  as_tibble(a$anova_table, rownames = "term") |>
    rename(p = `Pr(>F)`)
}

Power Simulation: Analysis Result

# test on first data set
analysis_func(sim_data$data[[1]])

term

num Df

den Df

MSE

F

ges

p

age_group

1

58

1,184.9

0.07

0.001

0.795

version

1

58

206.3

18.17

0.035

0.000

age_group:version

1

58

206.3

21.46

0.042

0.000

condition

1

58

169.6

2.05

0.003

0.158

age_group:condition

1

58

169.6

0.22

0.000

0.640

version:condition

1

58

197.3

4.60

0.009

0.036

age_group:version:condition

1

58

197.3

0.40

0.001

0.529

Power Simulation

power <- sim_data |>
  mutate(analysis = purrr::map(data, analysis_func)) |>
  select(-data) |>
  unnest(analysis) |>
  group_by(term) |>
  summarise(power = mean(p < .05))

term

power

age_group

0.10

age_group:condition

0.24

age_group:version

0.99

age_group:version:condition

0.24

condition

0.24

version

1.00

version:condition

0.30

Mixed Effects

Random factors

subj_n <- 2 # number of subjects

lmem_dat <- add_random(subj = subj_n)

subj

subj1

subj2

Crossed random factors

item_n <- 2 # number of items

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n)

subj

item

subj1

item1

subj1

item2

subj2

item1

subj2

item2

Nested random factors

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n, .nested_in = "subj")

subj

item

subj1

item1

subj1

item2

subj2

item3

subj2

item4

Fixed factors (within)

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental"))

subj

item

condition

subj1

item1

control

subj1

item1

experimental

subj1

item2

control

subj1

item2

experimental

subj2

item1

control

subj2

item1

experimental

subj2

item2

control

subj2

item2

experimental

Fixed factors (between)

gender_prob <- c(20, 75, 5) # gender category distribution

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob)

subj

item

condition

gender

subj1

item1

control

M

subj1

item1

experimental

M

subj1

item2

control

M

subj1

item2

experimental

M

subj2

item1

control

M

subj2

item1

experimental

M

subj2

item2

control

M

subj2

item2

experimental

M

Contrast Coding

My name Other names R function faux function
Treatment Treatment (2), Dummy (1, 4, 6), Simple (5) contr.treatment contr_code_treatment
Anova Deviation (2), Contrast (1), Simple (4) contr.treatment - 1/k contr_code_anova
Sum Sum (1, 2, 6), Effects (3), Deviation (4, 5), Unweighted Effects (7) contr.sum contr_code_sum
Difference Contrast (3), Forward/Backward (4), Repeated (5) MASS::contr.sdif contr_code_difference
Helmert Reverse Helmert (1, 4), Difference (5), Contrast (7) contr.helmert / (column_i+1) contr_code_helmert
Polynomial Polynomial (5), Orthogonal Polynomial (4), Trend (3) contr.poly contr_code_poly

Explainer

Codings

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "treat") |>
  add_contrast("condition", "anova", colnames = "anova") |>
  add_contrast("condition", "sum", colnames = "sum")

subj

item

condition

gender

treat

anova

sum

subj1

item1

control

M

0

-0.5

1

subj1

item1

experimental

M

1

0.5

-1

subj1

item2

control

M

0

-0.5

1

subj1

item2

experimental

M

1

0.5

-1

subj2

item1

control

F

0

-0.5

1

subj2

item1

experimental

F

1

0.5

-1

subj2

item2

control

F

0

-0.5

1

subj2

item2

experimental

F

1

0.5

-1

Design

# define this to make room later

design <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "cond")

subj

item

condition

gender

cond

subj1

item1

control

F

0

subj1

item1

experimental

F

1

subj1

item2

control

F

0

subj1

item2

experimental

F

1

subj2

item1

control

M

0

subj2

item1

experimental

M

1

subj2

item2

control

M

0

subj2

item2

experimental

M

1

Fixed Effects

intercept <- 100 # model intercept (mean for control condition)
cond_eff  <-  10 # condition effect size

lmem_dat <- design |>
  mutate(dv = intercept + 
           cond * cond_eff)

subj

item

condition

gender

cond

dv

subj1

item1

control

F

0

100

subj1

item1

experimental

F

1

110

subj1

item2

control

F

0

100

subj1

item2

experimental

F

1

110

subj2

item1

control

M

0

100

subj2

item1

experimental

M

1

110

subj2

item2

control

M

0

100

subj2

item2

experimental

M

1

110

Error Term

error_sd <- 20 # SD of trial-level error (residuals)

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  mutate(dv = intercept + err +
           cond * cond_eff)

subj

item

condition

gender

cond

err

dv

subj1

item1

control

F

0

-8.08

91.92

subj1

item1

experimental

F

1

-23.96

86.04

subj1

item2

control

F

0

0.23

100.23

subj1

item2

experimental

F

1

5.42

115.42

subj2

item1

control

M

0

1.00

101.00

subj2

item1

experimental

M

1

16.41

126.41

subj2

item2

control

M

0

-20.19

79.81

subj2

item2

experimental

M

1

-6.39

103.61

Random Intercepts

subj_sd      <-   8 # SD of subject-level intercepts
item_sd      <-   4 # SD of item-level intercepts

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", subj_i = subj_sd) |>
  add_ranef(.by = "item", item_i = item_sd) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * cond_eff)

subj

item

condition

gender

cond

err

subj_i

item_i

dv

subj1

item1

control

F

0

13.58

6.35

2.42

122.35

subj1

item1

experimental

F

1

5.94

6.35

2.42

124.70

subj1

item2

control

F

0

15.91

6.35

0.27

122.53

subj1

item2

experimental

F

1

16.86

6.35

0.27

133.48

subj2

item1

control

M

0

4.08

-12.49

2.42

94.01

subj2

item1

experimental

M

1

-16.53

-12.49

2.42

83.40

subj2

item2

control

M

0

-4.60

-12.49

0.27

83.19

subj2

item2

experimental

M

1

9.79

-12.49

0.27

107.57

Random Slopes

subj_cond_sd <-   5 # SD of subject-level condition effect size
item_cond_sd <-  15 # SD of item-level condition effect size
subj_cors    <-   0.5 # correlation between subject intercept and slope
item_cors    <-  -0.5 # correlation between item intercept and slope

lmem_dat <- design |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", 
            subj_i = subj_sd, 
            subj_cond = subj_cond_sd, 
            .cors = subj_cors) |>
  add_ranef(.by = "item", 
            item_i = item_sd, 
            item_cond = item_cond_sd, 
            .cors = item_cors) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * (cond_eff + subj_cond + item_cond))

Random Slopes

subj

item

condition

gender

cond

err

subj_i

subj_cond

item_i

item_cond

dv

subj1

item1

control

F

0

-8.13

3.30

5.51

-2.06

3.12

93.11

subj1

item1

experimental

F

1

27.74

3.30

5.51

-2.06

3.12

147.61

subj1

item2

control

F

0

30.25

3.30

5.51

-1.17

5.71

132.39

subj1

item2

experimental

F

1

-2.88

3.30

5.51

-1.17

5.71

120.48

subj2

item1

control

M

0

-57.82

7.53

-0.82

-2.06

3.12

47.65

subj2

item1

experimental

M

1

-31.04

7.53

-0.82

-2.06

3.12

86.73

subj2

item2

control

M

0

0.45

7.53

-0.82

-1.17

5.71

106.81

subj2

item2

experimental

M

1

-22.36

7.53

-0.82

-1.17

5.71

98.89

Parameters

subj_n       <- 100   # number of subjects
item_n       <-  20   # number of items
gender_prob  <- c(20, 75, 5) # gender category distribution
intercept    <- 100   # model intercept (mean for control condition)
cond_eff     <-  10   # condition effect size
error_sd     <-  20   # SD of trial-level error (residuals)
subj_sd      <-   8   # SD of subject-level intercepts
item_sd      <-   4   # SD of item-level intercepts
subj_cond_sd <-   5   # SD of subject-level condition effect size
item_cond_sd <-  15   # SD of item-level condition effect size
subj_cors    <-   0.5 # correlation between subject intercept and slope
item_cors    <-  -0.5 # correlation between item intercept and slope

Full Data Simulation Code

lmem_dat <- add_random(subj = subj_n) |>
  add_random(item = item_n) |>
  add_within(condition = c("control", "experimental")) |>
  add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
  add_contrast("condition", "treatment", colnames = "cond") |>
  add_ranef(err = error_sd) |>
  add_ranef(.by = "subj", 
            subj_i = subj_sd, 
            subj_cond = subj_cond_sd, 
            .cors = subj_cors) |>
  add_ranef(.by = "item", 
            item_i = item_sd, 
            item_cond = item_cond_sd, 
            .cors = item_cors) |>
  mutate(dv = intercept + subj_i + item_i + err +
           cond * (cond_eff + subj_cond + item_cond))

LMEM Analysis

lmer(dv ~ cond + 
       (1 + cond | subj) + 
       (1 + cond | item),
     data = lmem_dat) |> summary()
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: dv ~ cond + (1 + cond | subj) + (1 + cond | item)
   Data: lmem_dat

REML criterion at convergence: 35792.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4674 -0.6511 -0.0066  0.6492  3.6816 

Random effects:
 Groups   Name        Variance Std.Dev. Corr 
 subj     (Intercept)  48.191   6.942        
          cond         24.472   4.947   0.46 
 item     (Intercept)   7.951   2.820        
          cond        145.204  12.050   -0.07
 Residual             415.973  20.395        
Number of obs: 4000, groups:  subj, 100; item, 20

Fixed effects:
            Estimate Std. Error      df t value Pr(>|t|)    
(Intercept)  101.474      1.043  49.032  97.308  < 2e-16 ***
cond           9.509      2.814  20.202   3.379  0.00295 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
     (Intr)
cond -0.056

Power simulation

  1. Wrap dataset creation and analysis in a function that returns the values you care about as a data frame
  2. Iterate this function and save the output as a data frame
  3. Summarise the output (e.g., power, range of effect sizes)

Function Outline

sim_func <- function(iteration = 0, subj_n = 100, item_n = 20, cond_eff = 0) {
  # variables not in function arguments

  # simulate the data
  lmem_dat <- ...
  
  # run the analysis
  mod <- lmer(...)
  
  # return a table of fixed effects only
  broom.mixed::tidy(mod, effects = "fixed") |>
    mutate(iteration = iteration)
}

Full Function

sim_func <- function(iteration = 0, subj_n = 100, item_n = 20, cond_eff = 0) {
  # variables not in function arguments
  gender_prob  <- c(20, 75, 5)
  intercept    <- 100
  error_sd     <-  20
  subj_sd      <-   8
  item_sd      <-   4
  subj_cond_sd <-   5
  item_cond_sd <-  15
  subj_cors    <-   0.5
  item_cors    <-  -0.5

# simulate the data
  lmem_dat <- add_random(subj = subj_n) |>
    add_random(item = item_n) |>
    add_within(condition = c("control", "experimental")) |>
    add_between("subj", gender = c("M", "F", "NB"), .prob = gender_prob) |>
    add_contrast("condition", "treatment", colnames = "cond") |>
    add_ranef(err = error_sd) |>
    add_ranef(.by = "subj", 
              subj_i = subj_sd, 
              subj_cond = subj_cond_sd, 
              .cors = subj_cors) |>
    add_ranef(.by = "item", 
              item_i = item_sd, 
              item_cond = item_cond_sd, 
              .cors = item_cors) |>
    mutate(dv = intercept + subj_i + item_i + err +
             cond * (cond_eff + subj_cond + item_cond))
  
  # run the analysis
  mod <- lmer(dv ~ cond + 
       (1 + cond | subj) + 
       (1 + cond | item),
     data = lmem_dat)
  
  # return a table of fixed effects only
  broom.mixed::tidy(mod, effects = "fixed") |>
    mutate(iteration = iteration)
}

Test the Function

sim_func()

effect

term

estimate

std.error

statistic

df

p.value

fixed

(Intercept)

102.73

1.24

83.15

60.34

0.00

fixed

cond

-5.02

3.51

-1.43

19.51

0.17

# again to make sure iterations differ
sim_func()

effect

term

estimate

std.error

statistic

df

p.value

fixed

(Intercept)

100.28

1.25

80.14

53.40

0.00

fixed

cond

-1.78

4.10

-0.43

19.33

0.67

# change effect size
sim_func(cond_eff = 10)

effect

term

estimate

std.error

statistic

df

p.value

fixed

(Intercept)

100.09

1.22

81.90

61.96

0.00

fixed

cond

8.20

2.75

2.98

20.59

0.01

Iterate

set.seed(8675309)

sim_results <- map_df(1:10, sim_func, cond_eff = 10)

effect

term

estimate

std.error

statistic

df

p.value

iteration

fixed

(Intercept)

98.97

0.99

99.85

67.08

0.00

1

fixed

cond

11.69

3.18

3.67

20.22

0.00

1

fixed

(Intercept)

102.87

1.09

94.34

57.20

0.00

2

fixed

cond

7.64

2.85

2.68

19.88

0.01

2

fixed

(Intercept)

100.08

1.43

70.04

33.77

0.00

3

fixed

cond

6.06

4.42

1.37

19.22

0.19

3

Summarise

sim_results |>
  summarise(power = mean(p.value < 0.05), .by = term)

term

power

(Intercept)

1.0

cond

0.7

Further Resources

PsyPag Simulation Summer School

PsyPag Simulation Summer School

Data Simulation Workshops

Data Simulation Workshops

Thank You!

https://debruine.github.io/talks/Leuven-2025/

https://debruine.github.io/talks/Leuven-2025/

psyteachr.github.io

psyteachr.github.io