Appendix E — PsyArXiv

Setup

library(tidyverse) # for data wrangling
library(lubridate) # for handling dates
library(tidytext)  # for text analysis
library(ggraph)    # for plotting ngrams
library(magick)    # for cropping an image
library(grid)      # for raster

E.1 Data

PsyArXiv data from Day 27.

Code

preprint_data <- readRDS("data/preprint_data.RDS")

info <- map_df(preprint_data, function(p) {
  list(id = p$id,
       pub_dt = p$attributes$date_published,
       title = p$attributes$title,
       desc = p$attributes$description
       )
}) %>%
  mutate(pub_dt = as_datetime(pub_dt))

E.2 Ngrams

Here is a function that will extract any number of ngram.

Code

ngrams <- function(data, col, n = 2, sw = stop_words$word) {
  word_cols <- paste0("word_", 1:n)
  data %>%
    unnest_tokens(ngram, {{col}}, token = "ngrams", n = n) %>% 
    mutate(ngram_id = row_number()) %>%
    separate_rows(ngram, sep = " ") %>%
    filter(!ngram %in% sw) %>%
    group_by(ngram_id) %>%
    filter(n() == n) %>%
    mutate(word = word_cols) %>%
    ungroup() %>%
    pivot_wider(names_from = word, values_from = ngram) %>%
    count(across(all_of(word_cols)), sort = TRUE)
}

Look at the top 10 trigrams from a random 1000 entries.

Code

trigrams <- info %>%
  slice_sample(n = 1000) %>%
  ngrams(title, 3)

head(trigrams, 10)

word_1	word_2	word_3	n
covid	19	pandemic	24
randomized	controlled	trial	7
autism	spectrum	disorder	4
cross	lagged	panel	3
pilot	randomized	controlled	3
affective	task	switching	2
borderline	personality	disorder	2
cochlear	implant	listeners	2
cognitive	reflection	test	2
covid	19	lockdown	2

Calculate the bigrams for all titles.

Code

bigrams <- ngrams(info, title, 2)
head(bigrams, 10)

word_1	word_2	n
covid	19	1432
19	pandemic	549
mental	health	415
meta	analysis	406
systematic	review	302
individual	differences	217
social	media	125
emotion	regulation	108
virtual	reality	108
machine	learning	102

E.3 Plot

Use the code from Day 17 to make a bigram plot.

Code

bigrams %>%
  slice_max(order_by = n, n = 20) %>%
  ggraph(layout = "kk") +
  geom_edge_link(aes(width = n), color = "white", show.legend = FALSE) +
  geom_node_label(aes(label = name), vjust = 0.5, hjust = 0.5,
                  fill = "#CA1A31", color = "white", 
                  label.padding = unit(.5, "lines"),
                  label.r = unit(.75, "lines")) +
  coord_cartesian(clip = "off") +
  theme_void() +
  theme(plot.margin = unit(rep(.5, 4), "inches"),
        plot.background = element_rect(fill = "#012C4C"))

E.4 Add Image

Add the PsyArXiv logo using magick to crop the wikipedia logo and grid to rasterize it so it can be added as an annotation.

Process Image

#download.file("https://upload.wikimedia.org/wikipedia/commons/9/9c/PsyArXiv_logo.png", "images/psyarxiv.png")

# read and crop image
img <- magick::image_read("images/psyarxiv.png")
img_sq <- magick::image_crop(img, "174x174+4+4")
img_grob <- grid::rasterGrob(img_sq, interpolate=TRUE)

Plot Code

bigram_plot <- last_plot()

bigram_plot +
  annotation_custom(img_grob, 
                    xmin = -5, 
                    xmax =  -3, 
                    ymin = 1.2, 
                    ymax =  3.2)

The top 20 bigrams from PsyArXiv titles, with lines connecting them that show their popularity by width.