Tutorial

using Faux # for data simulation (under development!)
using DataFrames # for data wrangling
using CairoMakie # for plots
using GLM # to calculate regression lines (why doesn't Makie do this?)
using Statistics
import Random

rng = Random.seed!(8675309); # make randomness predictable :)

Random.TaskLocalRNG()

Simulate by Design

By default, sim_design() gives you a data frame with n = 100 observations of a single normally distributed variable called y, with mean = 0 and sd = 1. Here, we will simulate smaller n to make tables easier to see.

df = sim_design(n = 5)

5×2 DataFrame

Row	id	y
	String	Float64
1	id1	0.70632
2	id2	-0.366571
3	id3	-0.180916
4	id4	-0.684639
5	id5	-0.275842

You can change the name of the dependent variable or the id column.

df = sim_design(n = 5, dv = "score", id = "subj")

5×2 DataFrame

Row	subj	score
	String	Float64
1	subj1	0.46199
2	subj2	0.527764
3	subj3	1.66803
4	subj4	-0.112542
5	subj5	0.421035

You can add between-subject variables.

b = ["pet" => ["dog", "cat"]]
df = sim_design(n = 3, between = b)

6×3 DataFrame

Row	id	pet	y
	String	SubStrin…	Float64
1	id1	dog	-1.10909
2	id2	dog	0.167813
3	id3	dog	-0.623059
4	id4	cat	0.0341544
5	id5	cat	1.23148
6	id6	cat	-0.480946

And within-subject variables.

w = ["cond" => ["ctl", "exp"]]
df = sim_design(n = 3, within = w)

3×3 DataFrame

Row	id	ctl	exp
	String	Float64	Float64
1	id1	-0.0078768	0.310653
2	id2	1.04838	-1.33857
3	id3	0.937878	1.34539

You can return a long version of your data.

df = sim_design(n = 3, within = w, long = true)

6×3 DataFrame

Row	id	cond	y
	String	SubStrin…	Float64
1	id1	ctl	0.322081
2	id2	ctl	1.01367
3	id3	ctl	-0.526296
4	id1	exp	-0.552148
5	id2	exp	-0.625933
6	id3	exp	0.890368

Set the means, standard deviations, and correlations. Set empirical = true to set the sample parameters, rather than the population parameters.

b = ["pet" => ["dog", "cat"]]
w = ["cond" => ["ctl", "exp1", "exp2"]]
mu = Dict("dog" => Dict("ctl" => 10, "exp1" => 20, "exp2" => 30),
          "cat" => Dict("ctl" => 40, "exp1" => 50, "exp2" => 60))
sd = Dict("cat" => Dict("exp1" => 5, "exp2" => 6, "ctl" => 4),
          "dog" => Dict("ctl" => 1, "exp1" => 2, "exp2" => 3))
r = Dict("dog" => [.1, .2, .3],
         "cat" => [.4, .5, .6])

df = sim_design(n = 5, within = w, between = b,
                 mu = mu, sd = sd, r = r,
                 empirical = true)

10×5 DataFrame

Row	id	pet	ctl	exp1	exp2
	String	SubStrin…	Float64	Float64	Float64
1	id01	dog	9.49981	17.1466	27.2636
2	id02	dog	9.77781	22.0335	26.7213
3	id03	dog	8.76653	20.8465	33.002
4	id04	dog	11.3428	21.2126	32.9576
5	id05	dog	10.613	18.7608	30.0555
6	id06	cat	40.6515	52.9525	59.6213
7	id07	cat	44.7142	54.7053	61.7551
8	id08	cat	42.7929	47.1345	65.6898
9	id09	cat	35.5535	52.5761	62.9258
10	id10	cat	36.2879	42.6315	50.0079

dogs = filter(row -> row.pet == "dog", df)
get_params(dogs)

3×6 DataFrame

Row	var	ctl	exp1	exp2	mean	sd
	String	Float64	Float64	Float64	Float64	Float64
1	ctl	1.0	0.1	0.2	10.0	1.0
2	exp1	0.1	1.0	0.3	20.0	2.0
3	exp2	0.2	0.3	1.0	30.0	3.0

cats = filter(row -> row.pet == "cat", df)
get_params(cats)

3×6 DataFrame

Row	var	ctl	exp1	exp2	mean	sd
	String	Float64	Float64	Float64	Float64	Float64
1	ctl	1.0	0.4	0.5	40.0	4.0
2	exp1	0.4	1.0	0.6	50.0	5.0
3	exp2	0.5	0.6	1.0	60.0	6.0

df = sim_design(n = 1000,
                within = ["axis" => ["x", "y"]],
                mu = [0, 100],
                sd = [1, 10],
                r = 0.5);

# ugh, I want ggplot2 :(
f = Figure()
hist(f[1,1], df.x)
hist(f[1,2], df.y)
scatter(f[2,1],df.x, df.y, alpha = 0.25)
m = GLM.lm(@formula(y ~ x), df)
ablines!(f[2,1], coef(m)...)
f

Repeats

Add the rep argument to simulate multiple repeats.

df = sim_design(between = ["cond" => ["ctl", "exp"]],
                mu = [0, 0.25], rep = 1000, long = true)

Use a split-apply-combine pattern to run an analysis. First, define an analyss fucntion that takes a data frame as the only argument, and returns a data frame of analysis values.

# define an analysis function
function analysis(df)
    m = lm(@formula(y ~ cond), df)
    stats = coeftable(m)
    return DataFrame(stats)
end

# test the analysis function
analysis(df)

2×7 DataFrame

Row	Name	Coef.	Std. Error	t	Pr(>\|t\|)	Lower 95%	Upper 95%
	String	Float64	Float64	Float64	Float64	Float64	Float64
1	(Intercept)	0.001444	0.00315873	0.457144	0.647568	-0.00474704	0.00763503
2	cond: exp	0.24797	0.00446712	55.5099	0.0	0.239214	0.256725

Split the data by rep, run the analysis on each rep, and calculate summary stats like mean coefficient or power.

# split the data by rep
df_grp = groupby(df, :rep)

# run the analysis on each rep
analyses = combine(df_grp, analysis)

# calculate summary stats by factor
combine(groupby(analyses, :Name),
        x -> (
            power = mean(x[!, 6] .< 0.05),
            mean_coef = mean(x[!, 3]),
            sd_coef = std(x[!, 3])
        )
)

2×4 DataFrame

Row	Name	power	mean_coef	sd_coef
	String	Float64	Float64	Float64
1	(Intercept)	0.047	0.001444	0.0973973
2	cond: exp	0.411	0.24797	0.141287