Edit the code chunks below and knit the document. You can pipe your objects to glimpse()
or print()
to display them.
Here we will convert the data table from the ukbabynames package to a tibble and assign it the variable name ukb
. Use this data tibble for questions 1-13.
# do not alter this code chunk
ukb <- as_tibble(ukbabynames) # convert to a tibble
How many records are in the dataset?
nrecords <- nrow(ukb)
## or:
nrecords <- count(ukb) %>% pull(n) %>% print()
## [1] 227449
Remove the column rank
from the dataset.
norank <- ukb %>%
select(-rank) %>%
glimpse()
## Rows: 227,449
## Columns: 4
## $ year <dbl> 1996, 1996, 1996, 1996, 1996, 1996, 1996, 199…
## $ sex <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Sophie", "Chloe", "Jessica", "Emily", "Laure…
## $ n <dbl> 7087, 6824, 6711, 6415, 6299, 5916, 5866, 582…
What is the range of birth years contained in the dataset? Use summarise
to make a table with two columns: minyear
and maxyear
.
birth_range <- ukb %>%
summarise(minyear = min(year),
maxyear = max(year)) %>%
print()
## # A tibble: 1 x 2
## minyear maxyear
## <dbl> <dbl>
## 1 1996 2015
Make a table of only the data from babies named Hermione.
hermiones <- ukb %>%
filter(name == "Hermione") %>%
print()
## # A tibble: 20 x 5
## year sex name n rank
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1996 F Hermione 21 974
## 2 1997 F Hermione 29 789
## 3 1998 F Hermione 40 628
## 4 1999 F Hermione 35 678
## 5 2000 F Hermione 40 637
## 6 2001 F Hermione 52 540
## 7 2002 F Hermione 84 394
## 8 2003 F Hermione 158 265
## 9 2004 F Hermione 162 265
## 10 2005 F Hermione 122 331
## 11 2006 F Hermione 118 359
## 12 2007 F Hermione 109 385
## 13 2008 F Hermione 129 348
## 14 2009 F Hermione 116 370
## 15 2010 F Hermione 111 398
## 16 2011 F Hermione 118 392
## 17 2012 F Hermione 97 465
## 18 2013 F Hermione 77 542
## 19 2014 F Hermione 85 500
## 20 2015 F Hermione 79 549
Sort the dataset by sex and then by year (descending) and then by rank (descending).
sorted_babies <- ukb %>%
arrange(sex, desc(year), desc(rank)) %>%
glimpse()
## Rows: 227,449
## Columns: 5
## $ year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ sex <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Aabidah", "Aabish", "Aaeesha", "Aafreen", "A…
## $ n <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ rank <dbl> 5730, 5730, 5730, 5730, 5730, 5730, 5730, 573…
Create a new column, decade
, that contains the decade of birth (1990, 2000, 2010). Hint: see ?floor
ukb_decade <- ukb %>%
mutate(decade = floor(year / 10) * 10) %>%
glimpse()
## Rows: 227,449
## Columns: 6
## $ year <dbl> 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1…
## $ sex <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F"…
## $ name <chr> "Sophie", "Chloe", "Jessica", "Emily", "Lau…
## $ n <dbl> 7087, 6824, 6711, 6415, 6299, 5916, 5866, 5…
## $ rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …
## $ decade <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1…
Make a table of only the data from male babies named Courtney that were born between 1998 and 2001 (inclusive).
courtney <- ukb %>%
filter(name == "Courtney", sex == "M",
year >= 1998, year <= 2001) %>%
print()
## # A tibble: 4 x 5
## year sex name n rank
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1998 M Courtney 33 554
## 2 1999 M Courtney 15 973
## 3 2000 M Courtney 19 848
## 4 2001 M Courtney 22 786
How many distinct names are represented in the dataset? Make sure distinct_names
is an integer, not a data table.
distinct_names <- n_distinct(ukb$name)
# or
distinct_names <- ukb %>%
distinct(name) %>%
count() %>%
pull() %>%
print()
## [1] 31272
Make a table of only the data from the female babies named Frankie that were born before 1999 or after 2010.
frankie <- ukb %>%
filter(name == "Frankie",
sex == "F",
(year < 1999) | (year > 2010)) %>%
print()
## # A tibble: 8 x 5
## year sex name n rank
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1996 F Frankie 34 686
## 2 1997 F Frankie 48 545
## 3 1998 F Frankie 41 619
## 4 2011 F Frankie 337 156
## 5 2012 F Frankie 298 173
## 6 2013 F Frankie 273 195
## 7 2014 F Frankie 344 162
## 8 2015 F Frankie 376 147
How many total babies in the dataset were named ‘Emily’? Make sure emily
is an integer, not a data table.
emily <- ukb %>%
filter(name == "Emily") %>%
summarise(tot = sum(n)) %>%
pull(tot) %>%
print()
## [1] 102250
How many distinct names are there for each sex?
names_per_sex <- ukb %>%
group_by(sex) %>%
distinct(name) %>%
count %>%
print()
## # A tibble: 2 x 2
## # Groups: sex [2]
## sex n
## <chr> <int>
## 1 F 18823
## 2 M 14378
What is the most popular name in the dataset?
most_popular <- ukb %>%
group_by(name) %>%
summarise(tot = sum(n), .groups = "drop") %>%
arrange(desc(tot)) %>%
slice(1) %>%
pull(name)
## alternatively, this will give you all the top names if there are ties
most_popular <- ukb %>%
group_by(name) %>%
summarise(tot = sum(n), .groups = "drop") %>%
filter(rank(tot) == max(rank(tot))) %>%
pull(name) %>%
print()
## [1] "Jack"
How many babies were born each year for each sex? Make a plot.
babies_per_year <- ukb %>%
group_by(year, sex) %>%
summarise(tot = sum(n), .groups = "drop")
ggplot(babies_per_year, aes(year, tot, color = sex)) +
geom_line()
Load the dataset dataskills::personality.
Select only the personality question columns (not the user_id or date).
q_only <- dataskills::personality %>%
select(-user_id, -date) %>%
glimpse()
## Rows: 15,000
## Columns: 41
## $ Op1 <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, 6, …
## $ Ne1 <dbl> 4, 0, 0, 4, 1, 2, 3, 4, 0, 3, 3, 3, 2, 1, 1, …
## $ Ne2 <dbl> 0, 6, 6, 4, 2, 1, 2, 3, 1, 2, 5, 5, 3, 1, 1, …
## $ Op2 <dbl> 6, 0, 0, 4, 6, 4, 4, 0, 0, 3, 4, 3, 3, 4, 5, …
## $ Ex1 <dbl> 3, 0, 0, 2, 2, 4, 4, 3, 5, 4, 1, 1, 3, 3, 1, …
## $ Ex2 <dbl> 3, 0, 0, 3, 3, 4, 5, 2, 5, 3, 4, 1, 3, 2, 1, …
## $ Co1 <dbl> 3, 0, 0, 3, 5, 4, 3, 4, 5, 3, 3, 3, 1, 5, 5, …
## $ Co2 <dbl> 3, 0, 0, 3, 4, 3, 3, 4, 5, 3, 5, 3, 3, 4, 5, …
## $ Ne3 <dbl> 0, 0, 0, 1, 0, 1, 4, 4, 0, 4, 2, 5, 1, 2, 5, …
## $ Ag1 <dbl> 2, 0, 0, 4, 6, 5, 5, 4, 2, 5, 4, 3, 2, 4, 5, …
## $ Ag2 <dbl> 1, 6, 6, 0, 5, 4, 5, 3, 4, 3, 5, 1, 5, 4, 2, …
## $ Ne4 <dbl> 3, 6, 6, 2, 3, 2, 3, 3, 0, 4, 4, 5, 5, 4, 5, …
## $ Ex3 <dbl> 3, 6, 5, 5, 3, 3, 3, 0, 6, 1, 4, 2, 3, 2, 1, …
## $ Co3 <dbl> 2, 0, 1, 3, 4, 4, 5, 4, 5, 3, 4, 3, 4, 4, 5, …
## $ Op3 <dbl> 2, 6, 5, 5, 5, 4, 3, 2, 4, 3, 3, 6, 5, 5, 6, …
## $ Ex4 <dbl> 1, 0, 1, 3, 3, 3, 4, 3, 5, 3, 2, 0, 3, 3, 1, …
## $ Op4 <dbl> 3, 0, 1, 6, 6, 3, 3, 0, 6, 3, 4, 5, 4, 5, 6, …
## $ Ex5 <dbl> 3, 0, 1, 6, 3, 3, 4, 2, 5, 2, 2, 4, 2, 3, 0, …
## $ Ag3 <dbl> 1, 0, 1, 1, 0, 4, 4, 4, 3, 3, 4, 4, 3, 4, 4, …
## $ Co4 <dbl> 3, 6, 5, 5, 5, 3, 2, 4, 3, 1, 4, 3, 1, 2, 4, …
## $ Co5 <dbl> 0, 6, 5, 5, 5, 3, 3, 1, 5, 1, 2, 4, 4, 4, 2, …
## $ Ne5 <dbl> 3, 0, 1, 4, 1, 1, 4, 5, 0, 3, 4, 6, 2, 0, 1, …
## $ Op5 <dbl> 6, 6, 5, 2, 5, 4, 3, 2, 6, 6, 2, 4, 3, 4, 6, …
## $ Ag4 <dbl> 1, 0, 1, 4, 6, 5, 5, 6, 6, 6, 4, 2, 4, 5, 4, …
## $ Op6 <dbl> 0, 6, 5, 1, 6, 4, 6, 0, 0, 3, 5, 3, 5, 5, 5, …
## $ Co6 <dbl> 6, 0, 1, 4, 6, 5, 6, 5, 4, 3, 5, 5, 4, 6, 6, …
## $ Ex6 <dbl> 3, 6, 5, 3, 0, 4, 3, 1, 6, 3, 2, 1, 4, 2, 1, …
## $ Ne6 <dbl> 1, 6, 5, 1, 0, 1, 3, 4, 0, 4, 4, 5, 2, 1, 5, …
## $ Co7 <dbl> 3, 6, 5, 1, 3, 4, NA, 2, 3, 3, 2, 2, 4, 2, 5,…
## $ Ag5 <dbl> 3, 6, 5, 0, 2, 5, 6, 2, 2, 3, 4, 1, 3, 5, 2, …
## $ Co8 <dbl> 3, 0, 1, 1, 3, 4, 3, 0, 1, 3, 2, 2, 1, 2, 4, …
## $ Ex7 <dbl> 3, 6, 5, 4, 1, 2, 5, 3, 6, 3, 4, 3, 5, 1, 1, …
## $ Ne7 <dbl> NA, 0, 1, 2, 0, 2, 4, 4, 0, 3, 2, 5, 1, 2, 5,…
## $ Co9 <dbl> 3, 6, 5, 4, 3, 4, 5, 3, 5, 3, 4, 3, 4, 4, 2, …
## $ Op7 <dbl> 0, 6, 5, 5, 5, 4, 6, 2, 1, 3, 2, 4, 5, 5, 6, …
## $ Ne8 <dbl> 2, 0, 1, 1, 1, 1, 5, 4, 0, 4, 4, 5, 1, 2, 5, …
## $ Ag6 <dbl> NA, 6, 5, 2, 3, 4, 5, 6, 1, 3, 4, 2, 3, 5, 1,…
## $ Ag7 <dbl> 3, 0, 1, 1, 1, 3, 3, 5, 0, 3, 2, 1, 2, 3, 5, …
## $ Co10 <dbl> 1, 6, 5, 5, 3, 5, 1, 2, 5, 2, 4, 3, 4, 4, 3, …
## $ Ex8 <dbl> 2, 0, 1, 4, 3, 4, 2, 4, 6, 2, 4, 0, 4, 4, 1, …
## $ Ex9 <dbl> 4, 6, 5, 5, 5, 2, 3, 3, 6, 3, 3, 4, 4, 3, 2, …
Select the user_id
column and all of the columns with questions about openness.
openness <- dataskills::personality %>%
select(user_id, starts_with("Op")) %>%
glimpse()
## Rows: 15,000
## Columns: 8
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 68…
## $ Op1 <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, …
## $ Op2 <dbl> 6, 0, 0, 4, 6, 4, 4, 0, 0, 3, 4, 3, 3, 4, …
## $ Op3 <dbl> 2, 6, 5, 5, 5, 4, 3, 2, 4, 3, 3, 6, 5, 5, …
## $ Op4 <dbl> 3, 0, 1, 6, 6, 3, 3, 0, 6, 3, 4, 5, 4, 5, …
## $ Op5 <dbl> 6, 6, 5, 2, 5, 4, 3, 2, 6, 6, 2, 4, 3, 4, …
## $ Op6 <dbl> 0, 6, 5, 1, 6, 4, 6, 0, 0, 3, 5, 3, 5, 5, …
## $ Op7 <dbl> 0, 6, 5, 5, 5, 4, 6, 2, 1, 3, 2, 4, 5, 5, …
Select the user_id
column and all of the columns with the first question for each personality trait.
q1 <- dataskills::personality %>%
select(user_id, ends_with("1")) %>%
glimpse()
## Rows: 15,000
## Columns: 6
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 68…
## $ Op1 <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, …
## $ Ne1 <dbl> 4, 0, 0, 4, 1, 2, 3, 4, 0, 3, 3, 3, 2, 1, …
## $ Ex1 <dbl> 3, 0, 0, 2, 2, 4, 4, 3, 5, 4, 1, 1, 3, 3, …
## $ Co1 <dbl> 3, 0, 0, 3, 5, 4, 3, 4, 5, 3, 3, 3, 1, 5, …
## $ Ag1 <dbl> 2, 0, 0, 4, 6, 5, 5, 4, 2, 5, 4, 3, 2, 4, …
The code below sets up a fake dataset where 10 subjects respond to 20 trials with a dv
on a 5-point Likert scale.
set.seed(10)
fake_data <- tibble(
subj_id = rep(1:10, each = 20),
trial = rep(1:20, times = 10),
dv = sample.int(5, 10*20, TRUE)
)
You want to know how many times each subject responded with the same dv as their last trial. For example, if someone responded 2,3,3,3,4 for five trials they would have repeated their last response on the third and fourth trials. Use an offset function to determine how many times each subject repeated a response.
repeated_data <- fake_data %>%
group_by(subj_id) %>%
mutate(repeated = dv == lag(dv)) %>%
summarise(repeats = sum(repeated, na.rm = TRUE),
.groups = "drop") %>%
print()
## # A tibble: 10 x 2
## subj_id repeats
## <int> <int>
## 1 1 4
## 2 2 3
## 3 3 6
## 4 4 4
## 5 5 5
## 6 6 2
## 7 7 3
## 8 8 4
## 9 9 5
## 10 10 4
Create a table too_many_repeats
with the top two repeaters from repeated_data
(and anyone who might be tied with them) using ranking functions.
too_many_repeats <- repeated_data %>%
mutate(rank = dense_rank(repeats)) %>%
filter(rank == max(rank) | rank == max(rank)-1) %>%
print()
## # A tibble: 3 x 3
## subj_id repeats rank
## <int> <int> <int>
## 1 3 6 5
## 2 5 5 4
## 3 9 5 4
There are several ways to complete the following two tasks. Different people will solve them different ways, but you should be able to tell if your answers make sense.
Load the dataset dataskills::family_composition from last week’s exercise.
Calculate how many siblings of each sex each person has, narrow the dataset down to people with fewer than 6 siblings, and generate at least two different ways to graph this.
family <- dataskills::family_composition %>%
gather("sibtype", "n", oldbro:twinsis) %>%
separate(sibtype, c("sibage", "sibsex"), sep = -3)
sib6 <- family %>%
group_by(user_id, sex, sibsex) %>%
summarise(n = sum(n), .groups = "drop") %>%
group_by(user_id) %>%
filter(sex %in% c("male", "female"), sum(n) < 6)
ggplot(sib6, aes(n, fill = sibsex)) +
geom_histogram(binwidth = 1, colour = "black", position = "dodge")
sib6 <- family %>%
group_by(user_id, sex, sibsex) %>%
summarise(n = sum(n), .groups = "drop") %>%
filter(sex %in% c("male", "female")) %>%
spread(sibsex, n) %>%
filter(bro + sis < 6)
ggplot(sib6, aes(bro, sis)) +
geom_count()
# or
ggplot(sib6, aes(bro, sis)) +
geom_bin2d(binwidth = c(1,1)) +
stat_bin2d(geom = "text", aes(label = ..count..),
binwidth = c(1, 1), color = "white")
Use the dataset dataskills::eye_descriptions from last week’s exercise.
Create a list of the 10 most common descriptions from the eyes dataset. Remove useless descriptions and merge redundant descriptions.
eyes <- dataskills::eye_descriptions %>%
gather("face_id", "description", t1:t50) %>%
separate(description, c("d1", "d2", "d3", "d4"), sep = "(,|;|\\/)+", extra = "merge", fill = "right") %>%
gather("desc_n", "description", d1:d4) %>%
filter(!is.na(description)) %>% # gets rid of rows with no description
mutate(
description = trimws(description), # get rid of white space around string
description = tolower(description) # make all characters lowercase
) %>%
group_by(description) %>%
summarise(n = n(), .groups = "drop") %>% # count occurrences of each description
arrange(desc(n)) %>% # sort by count (descending)
filter(nchar(description) > 1) %>% # get rid of 1-character descriptions
filter(row_number() < 11)
# displays the table in a nice format
knitr::kable(eyes)
description | n |
---|---|
brown | 364 |
blue | 314 |
small | 276 |
pretty | 261 |
big | 240 |
round | 233 |
sad | 225 |
tired | 219 |
dark | 190 |
average | 176 |