Formative Exercise 05: MSc Data Skills Course

Edit the code chunks below and knit the document. You can pipe your objects to glimpse() or print() to display them.

UK Baby Names

Here we will convert the data table from the ukbabynames package to a tibble and assign it the variable name ukb. Use this data tibble for questions 1-13.

# do not alter this code chunk
ukb <- as_tibble(ukbabynames) # convert to a tibble

Question 1

How many records are in the dataset?

nrecords <- nrow(ukb)

## or: 

nrecords <- count(ukb) %>% pull(n) %>% print()

## [1] 227449

Question 2

Remove the column rank from the dataset.

norank <- ukb %>%
  select(-rank) %>%
  glimpse()

## Rows: 227,449
## Columns: 4
## $ year <dbl> 1996, 1996, 1996, 1996, 1996, 1996, 1996, 199…
## $ sex  <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Sophie", "Chloe", "Jessica", "Emily", "Laure…
## $ n    <dbl> 7087, 6824, 6711, 6415, 6299, 5916, 5866, 582…

Question 3

What is the range of birth years contained in the dataset? Use summarise to make a table with two columns: minyear and maxyear.

birth_range <- ukb %>%
  summarise(minyear = min(year),
            maxyear = max(year)) %>%
  print()

## # A tibble: 1 x 2
##   minyear maxyear
##     <dbl>   <dbl>
## 1    1996    2015

Question 4

Make a table of only the data from babies named Hermione.

hermiones <- ukb %>%
  filter(name == "Hermione") %>%
  print()

## # A tibble: 20 x 5
##     year sex   name         n  rank
##    <dbl> <chr> <chr>    <dbl> <dbl>
##  1  1996 F     Hermione    21   974
##  2  1997 F     Hermione    29   789
##  3  1998 F     Hermione    40   628
##  4  1999 F     Hermione    35   678
##  5  2000 F     Hermione    40   637
##  6  2001 F     Hermione    52   540
##  7  2002 F     Hermione    84   394
##  8  2003 F     Hermione   158   265
##  9  2004 F     Hermione   162   265
## 10  2005 F     Hermione   122   331
## 11  2006 F     Hermione   118   359
## 12  2007 F     Hermione   109   385
## 13  2008 F     Hermione   129   348
## 14  2009 F     Hermione   116   370
## 15  2010 F     Hermione   111   398
## 16  2011 F     Hermione   118   392
## 17  2012 F     Hermione    97   465
## 18  2013 F     Hermione    77   542
## 19  2014 F     Hermione    85   500
## 20  2015 F     Hermione    79   549

Question 5

Sort the dataset by sex and then by year (descending) and then by rank (descending).

sorted_babies <- ukb %>%
  arrange(sex, desc(year), desc(rank)) %>%
  glimpse()

## Rows: 227,449
## Columns: 5
## $ year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ sex  <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Aabidah", "Aabish", "Aaeesha", "Aafreen", "A…
## $ n    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ rank <dbl> 5730, 5730, 5730, 5730, 5730, 5730, 5730, 573…

Question 6

Create a new column, decade, that contains the decade of birth (1990, 2000, 2010). Hint: see ?floor

ukb_decade <- ukb %>%
  mutate(decade = floor(year / 10) * 10) %>%
  glimpse()

## Rows: 227,449
## Columns: 6
## $ year   <dbl> 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1…
## $ sex    <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F"…
## $ name   <chr> "Sophie", "Chloe", "Jessica", "Emily", "Lau…
## $ n      <dbl> 7087, 6824, 6711, 6415, 6299, 5916, 5866, 5…
## $ rank   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …
## $ decade <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1…

Question 7

Make a table of only the data from male babies named Courtney that were born between 1998 and 2001 (inclusive).

courtney <- ukb %>%
  filter(name == "Courtney", sex == "M",
         year >= 1998, year <= 2001) %>%
  print()

## # A tibble: 4 x 5
##    year sex   name         n  rank
##   <dbl> <chr> <chr>    <dbl> <dbl>
## 1  1998 M     Courtney    33   554
## 2  1999 M     Courtney    15   973
## 3  2000 M     Courtney    19   848
## 4  2001 M     Courtney    22   786

Question 8

How many distinct names are represented in the dataset? Make sure distinct_names is an integer, not a data table.

distinct_names <- n_distinct(ukb$name)

# or

distinct_names <- ukb %>%
  distinct(name) %>%
  count() %>%
  pull() %>%
  print()

## [1] 31272

Question 9

Make a table of only the data from the female babies named Frankie that were born before 1999 or after 2010.

frankie <- ukb %>%
  filter(name == "Frankie", 
         sex == "F",
         (year < 1999) | (year > 2010)) %>%
  print()

## # A tibble: 8 x 5
##    year sex   name        n  rank
##   <dbl> <chr> <chr>   <dbl> <dbl>
## 1  1996 F     Frankie    34   686
## 2  1997 F     Frankie    48   545
## 3  1998 F     Frankie    41   619
## 4  2011 F     Frankie   337   156
## 5  2012 F     Frankie   298   173
## 6  2013 F     Frankie   273   195
## 7  2014 F     Frankie   344   162
## 8  2015 F     Frankie   376   147

Question 10

How many total babies in the dataset were named ‘Emily’? Make sure emily is an integer, not a data table.

emily <- ukb %>%
  filter(name == "Emily") %>%
  summarise(tot = sum(n)) %>%
  pull(tot) %>%
  print()

## [1] 102250

Question 11

How many distinct names are there for each sex?

names_per_sex <- ukb %>% 
  group_by(sex) %>%
  distinct(name) %>%
  count %>%
  print()

## # A tibble: 2 x 2
## # Groups:   sex [2]
##   sex       n
##   <chr> <int>
## 1 F     18823
## 2 M     14378

Question 12

What is the most popular name in the dataset?

most_popular <- ukb %>%
  group_by(name) %>%
  summarise(tot = sum(n), .groups = "drop") %>%
  arrange(desc(tot)) %>%
  slice(1) %>%
  pull(name)

## alternatively, this will give you all the top names if there are ties
most_popular <- ukb %>%
  group_by(name) %>%
  summarise(tot = sum(n), .groups = "drop") %>%
  filter(rank(tot) == max(rank(tot))) %>%
  pull(name) %>%
  print()

## [1] "Jack"

Question 13

How many babies were born each year for each sex? Make a plot.

babies_per_year <- ukb %>%
  group_by(year, sex) %>%
  summarise(tot = sum(n), .groups = "drop")

ggplot(babies_per_year, aes(year, tot, color = sex)) +
  geom_line()

Select helpers

Load the dataset dataskills::personality.

Select only the personality question columns (not the user_id or date).

q_only <- dataskills::personality %>%
  select(-user_id, -date) %>%
  glimpse()

## Rows: 15,000
## Columns: 41
## $ Op1  <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, 6, …
## $ Ne1  <dbl> 4, 0, 0, 4, 1, 2, 3, 4, 0, 3, 3, 3, 2, 1, 1, …
## $ Ne2  <dbl> 0, 6, 6, 4, 2, 1, 2, 3, 1, 2, 5, 5, 3, 1, 1, …
## $ Op2  <dbl> 6, 0, 0, 4, 6, 4, 4, 0, 0, 3, 4, 3, 3, 4, 5, …
## $ Ex1  <dbl> 3, 0, 0, 2, 2, 4, 4, 3, 5, 4, 1, 1, 3, 3, 1, …
## $ Ex2  <dbl> 3, 0, 0, 3, 3, 4, 5, 2, 5, 3, 4, 1, 3, 2, 1, …
## $ Co1  <dbl> 3, 0, 0, 3, 5, 4, 3, 4, 5, 3, 3, 3, 1, 5, 5, …
## $ Co2  <dbl> 3, 0, 0, 3, 4, 3, 3, 4, 5, 3, 5, 3, 3, 4, 5, …
## $ Ne3  <dbl> 0, 0, 0, 1, 0, 1, 4, 4, 0, 4, 2, 5, 1, 2, 5, …
## $ Ag1  <dbl> 2, 0, 0, 4, 6, 5, 5, 4, 2, 5, 4, 3, 2, 4, 5, …
## $ Ag2  <dbl> 1, 6, 6, 0, 5, 4, 5, 3, 4, 3, 5, 1, 5, 4, 2, …
## $ Ne4  <dbl> 3, 6, 6, 2, 3, 2, 3, 3, 0, 4, 4, 5, 5, 4, 5, …
## $ Ex3  <dbl> 3, 6, 5, 5, 3, 3, 3, 0, 6, 1, 4, 2, 3, 2, 1, …
## $ Co3  <dbl> 2, 0, 1, 3, 4, 4, 5, 4, 5, 3, 4, 3, 4, 4, 5, …
## $ Op3  <dbl> 2, 6, 5, 5, 5, 4, 3, 2, 4, 3, 3, 6, 5, 5, 6, …
## $ Ex4  <dbl> 1, 0, 1, 3, 3, 3, 4, 3, 5, 3, 2, 0, 3, 3, 1, …
## $ Op4  <dbl> 3, 0, 1, 6, 6, 3, 3, 0, 6, 3, 4, 5, 4, 5, 6, …
## $ Ex5  <dbl> 3, 0, 1, 6, 3, 3, 4, 2, 5, 2, 2, 4, 2, 3, 0, …
## $ Ag3  <dbl> 1, 0, 1, 1, 0, 4, 4, 4, 3, 3, 4, 4, 3, 4, 4, …
## $ Co4  <dbl> 3, 6, 5, 5, 5, 3, 2, 4, 3, 1, 4, 3, 1, 2, 4, …
## $ Co5  <dbl> 0, 6, 5, 5, 5, 3, 3, 1, 5, 1, 2, 4, 4, 4, 2, …
## $ Ne5  <dbl> 3, 0, 1, 4, 1, 1, 4, 5, 0, 3, 4, 6, 2, 0, 1, …
## $ Op5  <dbl> 6, 6, 5, 2, 5, 4, 3, 2, 6, 6, 2, 4, 3, 4, 6, …
## $ Ag4  <dbl> 1, 0, 1, 4, 6, 5, 5, 6, 6, 6, 4, 2, 4, 5, 4, …
## $ Op6  <dbl> 0, 6, 5, 1, 6, 4, 6, 0, 0, 3, 5, 3, 5, 5, 5, …
## $ Co6  <dbl> 6, 0, 1, 4, 6, 5, 6, 5, 4, 3, 5, 5, 4, 6, 6, …
## $ Ex6  <dbl> 3, 6, 5, 3, 0, 4, 3, 1, 6, 3, 2, 1, 4, 2, 1, …
## $ Ne6  <dbl> 1, 6, 5, 1, 0, 1, 3, 4, 0, 4, 4, 5, 2, 1, 5, …
## $ Co7  <dbl> 3, 6, 5, 1, 3, 4, NA, 2, 3, 3, 2, 2, 4, 2, 5,…
## $ Ag5  <dbl> 3, 6, 5, 0, 2, 5, 6, 2, 2, 3, 4, 1, 3, 5, 2, …
## $ Co8  <dbl> 3, 0, 1, 1, 3, 4, 3, 0, 1, 3, 2, 2, 1, 2, 4, …
## $ Ex7  <dbl> 3, 6, 5, 4, 1, 2, 5, 3, 6, 3, 4, 3, 5, 1, 1, …
## $ Ne7  <dbl> NA, 0, 1, 2, 0, 2, 4, 4, 0, 3, 2, 5, 1, 2, 5,…
## $ Co9  <dbl> 3, 6, 5, 4, 3, 4, 5, 3, 5, 3, 4, 3, 4, 4, 2, …
## $ Op7  <dbl> 0, 6, 5, 5, 5, 4, 6, 2, 1, 3, 2, 4, 5, 5, 6, …
## $ Ne8  <dbl> 2, 0, 1, 1, 1, 1, 5, 4, 0, 4, 4, 5, 1, 2, 5, …
## $ Ag6  <dbl> NA, 6, 5, 2, 3, 4, 5, 6, 1, 3, 4, 2, 3, 5, 1,…
## $ Ag7  <dbl> 3, 0, 1, 1, 1, 3, 3, 5, 0, 3, 2, 1, 2, 3, 5, …
## $ Co10 <dbl> 1, 6, 5, 5, 3, 5, 1, 2, 5, 2, 4, 3, 4, 4, 3, …
## $ Ex8  <dbl> 2, 0, 1, 4, 3, 4, 2, 4, 6, 2, 4, 0, 4, 4, 1, …
## $ Ex9  <dbl> 4, 6, 5, 5, 5, 2, 3, 3, 6, 3, 3, 4, 4, 3, 2, …

Select the user_id column and all of the columns with questions about openness.

openness <- dataskills::personality %>%
  select(user_id, starts_with("Op")) %>%
  glimpse()

## Rows: 15,000
## Columns: 8
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 68…
## $ Op1     <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, …
## $ Op2     <dbl> 6, 0, 0, 4, 6, 4, 4, 0, 0, 3, 4, 3, 3, 4, …
## $ Op3     <dbl> 2, 6, 5, 5, 5, 4, 3, 2, 4, 3, 3, 6, 5, 5, …
## $ Op4     <dbl> 3, 0, 1, 6, 6, 3, 3, 0, 6, 3, 4, 5, 4, 5, …
## $ Op5     <dbl> 6, 6, 5, 2, 5, 4, 3, 2, 6, 6, 2, 4, 3, 4, …
## $ Op6     <dbl> 0, 6, 5, 1, 6, 4, 6, 0, 0, 3, 5, 3, 5, 5, …
## $ Op7     <dbl> 0, 6, 5, 5, 5, 4, 6, 2, 1, 3, 2, 4, 5, 5, …

Select the user_id column and all of the columns with the first question for each personality trait.

q1 <- dataskills::personality %>%
  select(user_id, ends_with("1")) %>%
  glimpse()

## Rows: 15,000
## Columns: 6
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 68…
## $ Op1     <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, …
## $ Ne1     <dbl> 4, 0, 0, 4, 1, 2, 3, 4, 0, 3, 3, 3, 2, 1, …
## $ Ex1     <dbl> 3, 0, 0, 2, 2, 4, 4, 3, 5, 4, 1, 1, 3, 3, …
## $ Co1     <dbl> 3, 0, 0, 3, 5, 4, 3, 4, 5, 3, 3, 3, 1, 5, …
## $ Ag1     <dbl> 2, 0, 0, 4, 6, 5, 5, 4, 2, 5, 4, 3, 2, 4, …

Window fuctions

The code below sets up a fake dataset where 10 subjects respond to 20 trials with a dv on a 5-point Likert scale.

set.seed(10)

fake_data <- tibble(
  subj_id = rep(1:10, each = 20),
  trial = rep(1:20, times = 10),
  dv = sample.int(5, 10*20, TRUE)
)

Question 14

You want to know how many times each subject responded with the same dv as their last trial. For example, if someone responded 2,3,3,3,4 for five trials they would have repeated their last response on the third and fourth trials. Use an offset function to determine how many times each subject repeated a response.

repeated_data <- fake_data %>%
  group_by(subj_id) %>%
  mutate(repeated = dv == lag(dv)) %>%
  summarise(repeats = sum(repeated, na.rm = TRUE),
            .groups = "drop") %>%
  print()

## # A tibble: 10 x 2
##    subj_id repeats
##      <int>   <int>
##  1       1       4
##  2       2       3
##  3       3       6
##  4       4       4
##  5       5       5
##  6       6       2
##  7       7       3
##  8       8       4
##  9       9       5
## 10      10       4

Question 15

Create a table too_many_repeats with the top two repeaters from repeated_data (and anyone who might be tied with them) using ranking functions.

too_many_repeats <- repeated_data %>%
  mutate(rank = dense_rank(repeats)) %>%
  filter(rank == max(rank) | rank == max(rank)-1) %>%
  print()

## # A tibble: 3 x 3
##   subj_id repeats  rank
##     <int>   <int> <int>
## 1       3       6     5
## 2       5       5     4
## 3       9       5     4

Advanced Questions

There are several ways to complete the following two tasks. Different people will solve them different ways, but you should be able to tell if your answers make sense.

Question 16

Load the dataset dataskills::family_composition from last week’s exercise.

Calculate how many siblings of each sex each person has, narrow the dataset down to people with fewer than 6 siblings, and generate at least two different ways to graph this.

family <- dataskills::family_composition %>%
  gather("sibtype", "n", oldbro:twinsis) %>%
  separate(sibtype, c("sibage", "sibsex"), sep = -3)

sib6 <- family %>%
  group_by(user_id, sex, sibsex) %>%
  summarise(n = sum(n), .groups = "drop") %>%
  group_by(user_id) %>%
  filter(sex %in% c("male", "female"), sum(n) < 6)

ggplot(sib6, aes(n, fill = sibsex)) +
  geom_histogram(binwidth = 1, colour = "black", position = "dodge")

sib6 <- family %>%
  group_by(user_id, sex, sibsex) %>%
  summarise(n = sum(n), .groups = "drop") %>%
  filter(sex %in% c("male", "female")) %>%
  spread(sibsex, n) %>%
  filter(bro + sis < 6)

ggplot(sib6, aes(bro, sis)) +
  geom_count()

# or
ggplot(sib6, aes(bro, sis)) +
  geom_bin2d(binwidth = c(1,1)) +
  stat_bin2d(geom = "text", aes(label = ..count..), 
             binwidth = c(1, 1), color = "white")

Question 17

Use the dataset dataskills::eye_descriptions from last week’s exercise.

Create a list of the 10 most common descriptions from the eyes dataset. Remove useless descriptions and merge redundant descriptions.

eyes <- dataskills::eye_descriptions %>%
  gather("face_id", "description", t1:t50) %>%
  separate(description, c("d1", "d2", "d3", "d4"), sep = "(,|;|\\/)+", extra = "merge", fill = "right") %>%
  gather("desc_n", "description", d1:d4) %>%
  filter(!is.na(description)) %>%          # gets rid of rows with no description
  mutate(
    description = trimws(description),     # get rid of white space around string
    description = tolower(description)     # make all characters lowercase
  ) %>%
  group_by(description) %>%
  summarise(n = n(), .groups = "drop") %>% # count occurrences of each description
  arrange(desc(n)) %>%                     # sort by count (descending)
  filter(nchar(description) > 1) %>%       # get rid of 1-character descriptions
  filter(row_number() < 11)

# displays the table in a nice format
knitr::kable(eyes)

description	n
brown	364
blue	314
small	276
pretty	261
big	240
round	233
sad	225
tired	219
dark	190
average	176