Edit the code chunks below and knit the document. You can pipe your objects to `glimpse()` or `print()` to display them.
Load the following data from the dataskills package (or access the linked CSV files online). Each participant is identified by a unique `user_id`.
# Load the three example datasets shipped with the dataskills package into
# the workspace in a single call.
data(
  list = c("disgust_scores", "personality_scores", "users"),
  package = "dataskills"
)
Add participant data to the disgust_scores table.
# Keep every disgust_scores row and attach the matching user columns,
# matching rows on user_id; glimpse() shows the result and passes it through.
study1 <- disgust_scores %>%
  left_join(users, by = "user_id") %>%
  glimpse()
## Rows: 20,000
## Columns: 8
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 15540…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 200…
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, …
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, …
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.71428…
## $ sex <chr> "female", "female", "male", "female", "m…
## $ birthyear <dbl> 1976, 1984, 1982, 1968, 1983, 1983, 1987…
Calculate the approximate age of each participant on the date they did the disgust_scores questionnaire and put this in a column called `age_years` in a new table called `study1_ages`.
# Approximate age = calendar year of the questionnaire minus birth year
# (birthday within the year is not taken into account).
study1_ages <- mutate(study1, age_years = year(date) - birthyear) %>%
  glimpse()
## Rows: 20,000
## Columns: 9
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 15540…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 200…
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, …
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, …
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.71428…
## $ sex <chr> "female", "female", "male", "female", "m…
## $ birthyear <dbl> 1976, 1984, 1982, 1968, 1983, 1983, 1987…
## $ age_years <dbl> 32, 24, 26, 40, 25, 25, 21, 30, 22, 38, …
Add the participant data to the disgust_scores data, but have the columns from the participant table first.
# A right join keeps every disgust_scores row while listing the users
# columns first, because users is the left-hand table.
study2 <- users %>%
  right_join(disgust_scores, by = "user_id") %>%
  glimpse()
## Rows: 20,000
## Columns: 8
## $ user_id <dbl> 0, 1, 2, 2118, 2311, 3630, 4458, 4651, 4…
## $ sex <chr> NA, "female", "male", "female", "male", …
## $ birthyear <dbl> NA, 1976, 1985, 1985, 1982, 1968, 1933, …
## $ id <dbl> 1199, 1, 1599, 13332, 23, 1160, 7980, 55…
## $ date <date> 2008-10-07, 2008-07-10, 2008-10-27, 201…
## $ moral <dbl> 5.285714, 1.428571, NA, 1.000000, 4.0000…
## $ pathogen <dbl> 4.714286, 2.714286, NA, 5.000000, 4.2857…
## $ sexual <dbl> 2.1428571, 1.7142857, NA, 3.0000000, 1.8…
How many times was the disgust_scores questionnaire completed by each sex? Create a table called `study2_by_sex` that has two columns: `sex` and `n`.
# Tally the rows of study2 for each value of sex (NA is counted as its own
# group); print() displays the table and passes it through to the assignment.
study2_by_sex <- count(study2, sex) %>%
  print()
## # A tibble: 4 x 2
## sex n
## <chr> <int>
## 1 female 13886
## 2 male 6012
## 3 nonbinary 3
## 4 <NA> 99
Advanced: Make a graph of how many people completed the questionnaire each year.
# Bar chart of the number of rows in study2 (questionnaire completions)
# per calendar year.
study2 %>%
  # Extract the four-digit year as a string so it is treated as a discrete
  # category for both the x-axis and the fill colour. format() states the
  # intent directly instead of relying on substr() coercing the Date to text.
  mutate(year = format(date, "%Y")) %>%
  count(year) %>%
  ggplot() +
  geom_col(aes(year, n, fill = year)) +
  labs(
    x = "Year",
    y = "Times Completed"
  ) +
  # The fill legend only repeats the x-axis, so hide it. `fill = FALSE` is
  # deprecated (ggplot2 >= 3.3.4 warns); "none" is the supported value.
  guides(fill = "none")
Create a table with only disgust_scores and personality_scores data from the same `user_id` collected on the same `date`.
# Keep only rows where the same user_id has both questionnaires on the
# same date; rows without a match in the other table are dropped.
study3 <- disgust_scores %>%
  inner_join(personality_scores, by = c("user_id", "date")) %>%
  glimpse()
## Rows: 555
## Columns: 11
## $ id <dbl> 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34,…
## $ user_id <dbl> 155324, 155386, 155567, 155571, 155665, 1…
## $ date <date> 2008-07-11, 2008-07-12, 2008-07-14, 2008…
## $ moral <dbl> 3.000000, 1.428571, 5.571429, 2.714286, 4…
## $ pathogen <dbl> 2.571429, 3.857143, 4.714286, 6.000000, 4…
## $ sexual <dbl> 1.8571429, 3.7142857, 2.5714286, 4.428571…
## $ Ag <dbl> 4.000000, 3.142857, 5.285714, 3.714286, 2…
## $ Co <dbl> 3.300000, 2.600000, 5.700000, 3.800000, 1…
## $ Ex <dbl> 4.8888889, 4.0000000, 3.8888889, 4.555555…
## $ Ne <dbl> 2.375, 0.250, 1.125, 2.250, 3.125, 1.375,…
## $ Op <dbl> 4.714286, 5.142857, 3.142857, 2.857143, 4…
Join data from the same `user_id`, regardless of `date`. Does this give you the same data table as above?
# Match on user_id only. Because date is no longer a join key, each table's
# date column is kept separately (date.x from disgust_scores, date.y from
# personality_scores).
study3_nodate <- disgust_scores %>%
  inner_join(personality_scores, by = "user_id") %>%
  glimpse()
## Rows: 677
## Columns: 12
## $ id <dbl> 1, 3, 6, 17, 18, 20, 21, 22, 24, 25, 32, …
## $ user_id <dbl> 1, 155324, 155386, 155567, 155571, 124756…
## $ date.x <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008…
## $ moral <dbl> 1.428571, 3.000000, 1.428571, 5.571429, 2…
## $ pathogen <dbl> 2.714286, 2.571429, 3.857143, 4.714286, 6…
## $ sexual <dbl> 1.7142857, 1.8571429, 3.7142857, 2.571428…
## $ date.y <date> 2006-02-08, 2008-07-11, 2008-07-12, 2008…
## $ Ag <dbl> 2.571429, 4.000000, 3.142857, 5.285714, 3…
## $ Co <dbl> 3.000000, 3.300000, 2.600000, 5.700000, 3…
## $ Ex <dbl> 2.6666667, 4.8888889, 4.0000000, 3.888888…
## $ Ne <dbl> 2.250, 2.375, 0.250, 1.125, 2.250, 3.375,…
## $ Op <dbl> 4.285714, 4.714286, 5.142857, 3.142857, 2…
Create a table of the disgust_scores and personality_scores data with each `user_id:date` on a single row, containing all of the data from both tables.
# A full join keeps every user_id/date combination found in either table;
# columns from the table with no matching row are filled with NA.
study4 <- disgust_scores %>%
  full_join(personality_scores, by = c("user_id", "date")) %>%
  glimpse()
## Rows: 34,445
## Columns: 11
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 155409…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008…
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, 1…
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, 3…
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.714285…
## $ Ag <dbl> NA, 4.000000, NA, NA, 3.142857, NA, NA, N…
## $ Co <dbl> NA, 3.3, NA, NA, 2.6, NA, NA, NA, NA, NA,…
## $ Ex <dbl> NA, 4.888889, NA, NA, 4.000000, NA, NA, N…
## $ Ne <dbl> NA, 2.375, NA, NA, 0.250, NA, NA, NA, NA,…
## $ Op <dbl> NA, 4.714286, NA, NA, 5.142857, NA, NA, N…
Create a table of just the data from the disgust_scores table for users who completed the personality_scores questionnaire that same day.
# A semi join filters disgust_scores to rows with a same-user, same-date
# match in personality_scores, without adding any personality columns.
study5 <- disgust_scores %>%
  semi_join(personality_scores, by = c("user_id", "date")) %>%
  glimpse()
## Rows: 555
## Columns: 6
## $ id <dbl> 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34,…
## $ user_id <dbl> 155324, 155386, 155567, 155571, 155665, 1…
## $ date <date> 2008-07-11, 2008-07-12, 2008-07-14, 2008…
## $ moral <dbl> 3.000000, 1.428571, 5.571429, 2.714286, 4…
## $ pathogen <dbl> 2.571429, 3.857143, 4.714286, 6.000000, 4…
## $ sexual <dbl> 1.8571429, 3.7142857, 2.5714286, 4.428571…
Create a table of data from users who did not complete either the personality_scores questionnaire or the disgust_scores questionnaire. (Hint: this will require two steps; use pipes.)
# Two successive anti joins: first drop users who appear in
# personality_scores, then drop those who appear in disgust_scores, leaving
# users with neither questionnaire.
study6 <- anti_join(
  anti_join(users, personality_scores, by = "user_id"),
  disgust_scores,
  by = "user_id"
) %>%
  glimpse()
## Rows: 17,728
## Columns: 3
## $ user_id <dbl> 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 3…
## $ sex <chr> "male", "female", "female", "female", "m…
## $ birthyear <dbl> 1972, 1978, 1981, 1980, 1964, 1945, 1973…
Load new user data from `dataskills::users2`. Bind the original and new user tables into a single table called `users_all`.
# Load the second user table, then stack it underneath the first.
# bind_rows() does not remove duplicates, so users present in both tables
# appear twice in users_all.
data(list = "users2", package = "dataskills")
users_all <- users %>%
  bind_rows(users2) %>%
  glimpse()
## Rows: 112,043
## Columns: 3
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22…
## $ sex <chr> NA, "female", "male", "male", "male", "m…
## $ birthyear <dbl> NA, 1976, 1985, 1980, 1968, 1972, 1978, …
How many users are in both the first and second user table?
# Number of rows that occur in both user tables (whole rows are compared,
# not just user_id).
both_n <- users %>%
  dplyr::intersect(users2) %>%
  nrow() %>%
  print()
## [1] 11602
How many unique users are there in total across the first and second user tables?
# Number of distinct rows across the two user tables combined
# (union() drops duplicates).
unique_users <- users %>%
  dplyr::union(users2) %>%
  nrow() %>%
  print()
## [1] 100441
How many users are in the first, but not the second, user table?
# Number of rows found in users but not in users2.
first_users <- users %>%
  dplyr::setdiff(users2) %>%
  nrow() %>%
  print()
## [1] 40441
How many users are in the second, but not the first, user table?
# Number of rows found in users2 but not in users (argument order matters
# for setdiff()).
second_users <- users2 %>%
  dplyr::setdiff(users) %>%
  nrow() %>%
  print()
## [1] 48398