# load the necessary libraries
library(tidyverse)
Class Activity 9
Your Turn 1
a) read_csv()
Use read_csv()
to import the desserts
data set from https://raw.githubusercontent.com/deepbas/statdatasets/main/desserts.csv. Use glimpse
to see if the data import is alright.
<- "https://raw.githubusercontent.com/deepbas/statdatasets/main/desserts.csv"
url <- read_csv(url)
desserts glimpse(desserts)
Rows: 549
Columns: 16
$ series <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ episode <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
$ baker <chr> "Annetha", "David", "Edd", "Jasminder", "Jonatha…
$ technical <chr> "2nd", "3rd", "1st", "N/A", "9th", "N/A", "8th",…
$ result <chr> "IN", "IN", "IN", "IN", "IN", "IN", "IN", "IN", …
$ uk_airdate <chr> "17 August 2010", "17 August 2010", "17 August 2…
$ us_season <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ us_airdate <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ showstopper_chocolate <chr> "chocolate", "chocolate", "no chocolate", "no ch…
$ showstopper_dessert <chr> "other", "other", "other", "other", "other", "ca…
$ showstopper_fruit <chr> "no fruit", "no fruit", "no fruit", "no fruit", …
$ showstopper_nut <chr> "no nut", "no nut", "no nut", "no nut", "almond"…
$ signature_chocolate <chr> "no chocolate", "chocolate", "no chocolate", "no…
$ signature_dessert <chr> "cake", "cake", "cake", "cake", "cake", "cake", …
$ signature_fruit <chr> "no fruit", "fruit", "fruit", "fruit", "fruit", …
$ signature_nut <chr> "no nut", "no nut", "no nut", "no nut", "no nut"…
b) Are there any issues with the data import? If so, what are they?
Click for answer
Answer: Based on the output of glimpse, we can see that the ‘technical’ column should be a numeric column and the ‘uk_airdate’ column should be a date column. We can also identify any issues with missing values.
# your r-code
<- read_csv(url,
desserts col_types = list(
technical = col_number(),
uk_airdate = col_date()
)
)
problems(desserts)
# A tibble: 556 × 5
row col expected actual file
<int> <int> <chr> <chr> <chr>
1 2 6 date in ISO8601 17 August 2010 ""
2 3 6 date in ISO8601 17 August 2010 ""
3 4 6 date in ISO8601 17 August 2010 ""
4 5 4 a number N/A ""
5 5 6 date in ISO8601 17 August 2010 ""
6 6 6 date in ISO8601 17 August 2010 ""
7 7 4 a number N/A ""
8 7 6 date in ISO8601 17 August 2010 ""
9 8 6 date in ISO8601 17 August 2010 ""
10 9 4 a number N/A ""
# ℹ 546 more rows
c) Import the dataset with correct data types, if needed. Fix the problems, if any.
Click for answer
<- read_csv(url,
desserts col_types = list(
technical = col_number(),
uk_airdate = col_date()
)
)
problems(desserts)
# A tibble: 556 × 5
row col expected actual file
<int> <int> <chr> <chr> <chr>
1 2 6 date in ISO8601 17 August 2010 ""
2 3 6 date in ISO8601 17 August 2010 ""
3 4 6 date in ISO8601 17 August 2010 ""
4 5 4 a number N/A ""
5 5 6 date in ISO8601 17 August 2010 ""
6 6 6 date in ISO8601 17 August 2010 ""
7 7 4 a number N/A ""
8 7 6 date in ISO8601 17 August 2010 ""
9 8 6 date in ISO8601 17 August 2010 ""
10 9 4 a number N/A ""
# ℹ 546 more rows
<- read_csv(url,
desserts col_types = list(
technical = col_number(),
uk_airdate = col_date(format = "%d %B %Y")
)
)
problems(desserts)
# A tibble: 7 × 5
row col expected actual file
<int> <int> <chr> <chr> <chr>
1 5 4 a number N/A ""
2 7 4 a number N/A ""
3 9 4 a number N/A ""
4 11 4 a number N/A ""
5 35 4 a number N/A ""
6 36 4 a number N/A ""
7 37 4 a number N/A ""
<- read_csv(url,
desserts col_types = list(
technical = col_number(),
uk_airdate = col_date(format = "%d %B %Y")
),na = c("", "NA", "N/A")
)
problems(desserts)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
Your Turn 2
Use the appropriate read_<type>()
function to import the datasets provided in the course webpage. Identify and fix any issues you encounter.
a) Importing simple data:
Click for answer
<- readr::read_csv("data/simple-1.dat")
simple1 problems(simple1)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
simple1
# A tibble: 3 × 4
initials state age time
<chr> <chr> <dbl> <time>
1 vib MA 61 06:01
2 adc TX 45 05:45
3 kme CT 50 04:19
b) Importing mildly tricky data:
Click for answer
<- readr::read_delim("data/mild-1.csv", delim = "|")
mild1 problems(mild1)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
mild1
# A tibble: 10 × 5
first last address city postcode
<chr> <chr> <chr> <chr> <chr>
1 Leah Downs 688-5741 Ut St. Owensboro V9Z 9K2
2 Boris Kirby 257-5422 Vel Avenue Rialto C6I 9S0
3 Naida Franco 809-5528 Tristique Avenue Atwater T8K 7U8
4 Xena Tucker 7218 A St. Grand Forks M6O 1X4
5 Rylee Wise 155-6070 Purus. St. Bradford 65359
6 Baxter Gallagher 2415 Ligula. St. Carbondale 55211
7 Griffin Benjamin 3261 Ac St. Guayama 94450
8 Rinah Bradley 787-9626 Eget Avenue Norton 17673
9 Tobias Walter 4717 Mauris. Street Attleboro 73678
10 Boris Farley 893-8193 Quisque Avenue San Clemente 74492
c) Importing tricky data 1:
Click for answer
<- read_csv("data/tricky-1.csv")
tricky1 problems(tricky1)
# A tibble: 2 × 5
row col expected actual file
<int> <int> <chr> <chr> <chr>
1 4 4 5 columns 4 columns /home/deepak/Desktop/website/stat220-spring24…
2 7 4 5 columns 4 columns /home/deepak/Desktop/website/stat220-spring24…
# Fix missing values
3, ] <- c(tricky1[3, 1:2], NA, tricky1[3, 3:4])
tricky1[6, ] <- c(tricky1[4, 1], NA, tricky1[4, 3:5])
tricky1[ tricky1
# A tibble: 10 × 5
first last address city postcode
<chr> <chr> <chr> <chr> <chr>
1 Leah Downs 688-5741 Ut St. Owensboro V9Z 9K2
2 Boris Kirby 257-5422 Vel Avenue Rialto C6I 9S0
3 Naida Franco <NA> Atwater T8K 7U8
4 Xena Tucker 7218 A St. Grand Forks M6O 1X4
5 Rylee Wise 155-6070 Purus. St. Bradford 65359
6 Xena <NA> 7218 A St. Grand Forks M6O 1X4
7 Griffin Benjamin 3261 Ac St. Guayama 94450
8 Rinah Bradley 787-9626 Eget Avenue Norton 17673
9 Tobias Walter 4717 Mauris. Street Attleboro 73678
10 Boris Farley 893-8193 Quisque Avenue San Clemente 74492
d) Importing tricky data 2:
Click for answer
<- read_csv("data/tricky-2.csv")
tricky2 problems(tricky2)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
# Fix missing values
<- read_csv("data/tricky-2.csv", n_max = 7) %>%
tricky2_part1 separate(city, c("city", "state"), sep = ", ") %>%
select(-c(7))
<- read_csv(
tricky2_part2 "data/tricky-2.csv",
skip = 8,
col_names = c("iata", "airport", "city", "state", "latitude", "longitude")
)
# Combine parts
<- full_join(tricky2_part1, tricky2_part2)
data_combined data_combined
# A tibble: 16 × 6
iata airport city state latitude longitude
<chr> <chr> <chr> <chr> <dbl> <dbl>
1 00M Thigpen Bay Springs MS 32.0 -89.2
2 00R Livingston Municipal Livingston TX 30.7 -95.0
3 00V Meadow Lake Colorado Springs CO 38.9 -105.
4 01G Perry-Warsaw Perry NY 42.7 -78.1
5 01J Hilliard Airpark Hilliard FL 30.7 -81.9
6 01M Tishomingo County Belmont MS 34.5 -88.2
7 02A Gragg-Wade Clanton AL 32.9 -86.6
8 02C Capitol Brookfield WI 43.1 -88.2
9 02G Columbiana County East Liverpool OH 40.7 -80.6
10 03D Memphis Memorial Memphis MO 40.4 -92.2
11 04M Calhoun County Pittsboro MS 33.9 -89.3
12 04Y Hawley Municipal Hawley MN 46.9 -96.4
13 05C Griffith-Merrillville Griffith IN 41.5 -87.4
14 05F Gatesville - City/County Gatesville TX 31.4 -97.8
15 05U Eureka Eureka NV 39.6 -116.
16 07M Riverside Municipal Riverside CA 34.0 -117.
Acknowledgement
Parts of the activities are adapted from similar activity written by Adam Loy.