PCA

R
pca
princomp
psych
factanal
Published

2017-06-26

Download the Rmd notebook for this example

Putting together this page made me realise I still don’t know anything about PCA and factor analysis.

I use the psych package for SPSS-style PCA.

library(tidyverse)
library(psych)
library(viridis)

First, I’ll simulate some data with an underlying structure of three factors.

set.seed(444)  # for reproducibility; delete when running simulations

a <- rnorm(100, 0, 1)
b <- rnorm(100, 0, 1)
c <- rnorm(100, 0, 1)

df <- data.frame(
  id = seq(1,100),
  a1 = a + rnorm(100, 0, 1),
  a2 = a + rnorm(100, 0, .8),
  a3 = a + rnorm(100, 0, .6),
  a4 = -a + rnorm(100, 0, .4),
  b1 = b + rnorm(100, 0, 1),
  b2 = b + rnorm(100, 0, .8),
  b3 = b + rnorm(100, 0, .6),
  b4 = -b + rnorm(100, 0, .4),
  c1 = c + rnorm(100, 0, 1),
  c2 = c + rnorm(100, 0, .8),
  c3 = c + rnorm(100, 0, .6),
  c4 = -c + rnorm(100, 0, .4)
)

Select just the columns you want for your PCA. You can visualise their correlations with cor() and ggplot().

traits <- df %>% select(-id) 

traits %>%
  cor() %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", a1:c4) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(-1, 1))

Determine the number of factors to extract. Here I use the SPSS-style default criterion of Eigenvalues > 1

ev <- eigen(cor(traits));
nfactors <- length(ev$values[ev$values > 1]);
nfactors
[1] 3

Principal components analysis (SPSS-style)

principal(rotation = “none”)

traits.principal <- principal(traits, nfactors=nfactors, rotate="none", scores=TRUE)
traits.principal
Principal Components Analysis
Call: principal(r = traits, nfactors = nfactors, rotate = "none", scores = TRUE)
Standardized loadings (pattern matrix) based upon correlation matrix
     PC1   PC2   PC3   h2   u2 com
a1 -0.63  0.24  0.42 0.63 0.37 2.1
a2 -0.72  0.12  0.50 0.77 0.23 1.8
a3 -0.65  0.26  0.55 0.79 0.21 2.3
a4  0.73 -0.23 -0.49 0.83 0.17 2.0
b1  0.14  0.75 -0.19 0.62 0.38 1.2
b2  0.09  0.83 -0.16 0.71 0.29 1.1
b3  0.10  0.89 -0.16 0.83 0.17 1.1
b4 -0.12 -0.88  0.15 0.81 0.19 1.1
c1  0.64  0.04  0.50 0.66 0.34 1.9
c2  0.77  0.10  0.42 0.78 0.22 1.6
c3  0.70  0.08  0.54 0.79 0.21 1.9
c4 -0.80 -0.04 -0.49 0.88 0.12 1.7

                       PC1  PC2  PC3
SS loadings           4.05 3.02 2.03
Proportion Var        0.34 0.25 0.17
Cumulative Var        0.34 0.59 0.76
Proportion Explained  0.45 0.33 0.22
Cumulative Proportion 0.45 0.78 1.00

Mean item complexity =  1.6
Test of the hypothesis that 3 components are sufficient.

The root mean square of the residuals (RMSR) is  0.05 
 with the empirical chi square  33.59  with prob <  0.44 

Fit based upon off diagonal values = 0.98
scores.principal <- traits.principal$scores
cor(scores.principal, traits) %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", a1:c4) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(-1, 1))

principal(rotation = “varimax”)

traits.varimax <- principal(traits, nfactors=nfactors, rotate="varimax", scores=TRUE)
traits.varimax
Principal Components Analysis
Call: principal(r = traits, nfactors = nfactors, rotate = "varimax", 
    scores = TRUE)
Standardized loadings (pattern matrix) based upon correlation matrix
     RC1   RC3   RC2   h2   u2 com
a1 -0.15  0.78  0.06 0.63 0.37 1.1
a2 -0.16  0.86 -0.09 0.77 0.23 1.1
a3 -0.07  0.89  0.05 0.79 0.21 1.0
a4  0.17 -0.90 -0.02 0.83 0.17 1.1
b1  0.03 -0.05  0.79 0.62 0.38 1.0
b2  0.02  0.03  0.84 0.71 0.29 1.0
b3  0.03  0.05  0.91 0.83 0.17 1.0
b4 -0.05 -0.03 -0.90 0.81 0.19 1.0
c1  0.81 -0.08  0.00 0.66 0.34 1.0
c2  0.85 -0.21  0.09 0.78 0.22 1.1
c3  0.88 -0.09  0.04 0.79 0.21 1.0
c4 -0.92  0.20 -0.02 0.88 0.12 1.1

                       RC1  RC3  RC2
SS loadings           3.09 3.03 2.98
Proportion Var        0.26 0.25 0.25
Cumulative Var        0.26 0.51 0.76
Proportion Explained  0.34 0.33 0.33
Cumulative Proportion 0.34 0.67 1.00

Mean item complexity =  1
Test of the hypothesis that 3 components are sufficient.

The root mean square of the residuals (RMSR) is  0.05 
 with the empirical chi square  33.59  with prob <  0.44 

Fit based upon off diagonal values = 0.98
scores.varimax <- traits.varimax$scores
cor(scores.varimax, traits) %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", a1:c4) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(-1, 1))

Here are some other functions for PCA/Factor Analysis

princomp()

traits.princomp <- princomp(traits)
traits.princomp$loadings[,1:nfactors]
        Comp.1       Comp.2      Comp.3
a1  0.33006521  0.181709692  0.45906811
a2  0.29151833  0.067518916  0.38086462
a3  0.23950762  0.132057875  0.38289101
a4 -0.25823681 -0.112938603 -0.32926216
b1 -0.10631062  0.507810878 -0.12639281
b2 -0.07370167  0.500188081 -0.09771759
b3 -0.06894907  0.491029731 -0.08032005
b4  0.07460353 -0.426344256  0.07374535
c1 -0.44100481 -0.021914387  0.38825383
c2 -0.42646359  0.010511567  0.23488306
c3 -0.35757417 -0.001802824  0.29574993
c4  0.38827130  0.026358392 -0.24163371
scores.princomp <- traits.princomp$scores[,1:nfactors]
cor(scores.princomp, traits) %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", a1:c4) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(-1, 1))

factanal(rotation = “none”)

traits.fa <- factanal(traits, nfactors, rotation="none", scores="regression")
print(traits.fa, digits=2, cutoff=0, sort=FALSE)

Call:
factanal(x = traits, factors = nfactors, scores = "regression",     rotation = "none")

Uniquenesses:
  a1   a2   a3   a4   b1   b2   b3   b4   c1   c2   c3   c4 
0.52 0.32 0.26 0.18 0.53 0.40 0.18 0.23 0.50 0.27 0.32 0.08 

Loadings:
   Factor1 Factor2 Factor3
a1 -0.46    0.26    0.45  
a2 -0.55    0.17    0.60  
a3 -0.47    0.32    0.64  
a4  0.57   -0.30   -0.64  
b1  0.09    0.63   -0.25  
b2  0.07    0.74   -0.21  
b3  0.08    0.87   -0.24  
b4 -0.10   -0.84    0.24  
c1  0.66    0.03    0.25  
c2  0.83    0.09    0.19  
c3  0.77    0.08    0.29  
c4 -0.92   -0.05   -0.28  

               Factor1 Factor2 Factor3
SS loadings       3.65    2.71    1.86
Proportion Var    0.30    0.23    0.16
Cumulative Var    0.30    0.53    0.68

Test of the hypothesis that 3 factors are sufficient.
The chi square statistic is 22.21 on 33 degrees of freedom.
The p-value is 0.923 
scores.fa <- traits.fa$scores
cor(scores.fa, traits) %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", a1:c4) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(-1, 1))

factanal(rotation = “varimax”)

traits.fa.vm <- factanal(traits, nfactors, rotation="varimax", scores="regression")
print(traits.fa.vm, digits=2, cutoff=0, sort=FALSE)

Call:
factanal(x = traits, factors = nfactors, scores = "regression",     rotation = "varimax")

Uniquenesses:
  a1   a2   a3   a4   b1   b2   b3   b4   c1   c2   c3   c4 
0.52 0.32 0.26 0.18 0.53 0.40 0.18 0.23 0.50 0.27 0.32 0.08 

Loadings:
   Factor1 Factor2 Factor3
a1 -0.16    0.67    0.06  
a2 -0.18    0.80   -0.08  
a3 -0.08    0.85    0.05  
a4  0.17   -0.89   -0.03  
b1  0.02   -0.04    0.68  
b2  0.03    0.04    0.77  
b3  0.04    0.05    0.91  
b4 -0.05   -0.03   -0.87  
c1  0.70   -0.10    0.00  
c2  0.82   -0.21    0.09  
c3  0.82   -0.11    0.04  
c4 -0.94    0.19   -0.02  

               Factor1 Factor2 Factor3
SS loadings       2.82    2.73    2.67
Proportion Var    0.23    0.23    0.22
Cumulative Var    0.23    0.46    0.68

Test of the hypothesis that 3 factors are sufficient.
The chi square statistic is 22.21 on 33 degrees of freedom.
The p-value is 0.923 
scores.fa.vm <- traits.fa.vm$scores
cor(scores.fa.vm, traits) %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", a1:c4) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(-1, 1))

How do they compare?

Here, I’ll plot the absolute value of all the correlations (since the sign on factors/PCs is arbitrary).

The functions principal(rotation = “varimax”) and factanal(rotation = “varimax”) are nearly (but not perfectly) identical.

scores.fa <- traits.fa$scores
colnames(scores.principal) <- c("principal() 1", "principal() 2", "principal() 3")
colnames(scores.varimax) <- c("principal(vm) 1", "principal(vm) 2", "principal(vm) 3")
colnames(scores.princomp) <- c("princomp() 1", "princomp() 2", "princomp() 3")
colnames(scores.fa) <- c("factanal() 1", "factanal() 2", "factanal() 3")
colnames(scores.fa.vm) <- c("factanal(vm) 1", "factanal(vm) 2", "factanal(vm) 3")

cbind(
  scores.princomp,
  scores.principal,
  scores.fa,
  scores.varimax,
  scores.fa.vm
) %>% 
  cor() %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather("var2", "value", 1:15) %>%
  mutate(var1 = factor(var1), var1 = factor(var1, levels = rev(levels(var1)))) %>%
  mutate(value = abs(value)) %>%
  ggplot(aes(var2, var1, fill=value)) +
  geom_tile() +
  scale_x_discrete(position = "top") +
  xlab("") + ylab("") +
  scale_fill_viridis(limits=c(0, 1)) +
  geom_hline(yintercept = 3.5, color="white") +
  geom_hline(yintercept = 6.5, color="white") +
  geom_hline(yintercept = 9.5, color="white") +
  geom_hline(yintercept = 12.5, color="white") +
  geom_vline(xintercept = 3.5, color="white") +
  geom_vline(xintercept = 6.5, color="white") +
  geom_vline(xintercept = 9.5, color="white") +
  geom_vline(xintercept = 12.5, color="white") +
  theme(axis.text.x=element_text(angle=90,hjust=1))