Class Activity 9

# load the necessary libraries
library(tidyverse)

Your Turn 1

a) read_csv()

Use read_csv() to import the desserts data set from https://raw.githubusercontent.com/deepbas/statdatasets/main/desserts.csv. Use glimpse to see if the data import is alright.

url <- "https://raw.githubusercontent.com/deepbas/statdatasets/main/desserts.csv"
desserts <- read_csv(url)
glimpse(desserts)
Rows: 549
Columns: 16
$ series                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ episode               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
$ baker                 <chr> "Annetha", "David", "Edd", "Jasminder", "Jonatha…
$ technical             <chr> "2nd", "3rd", "1st", "N/A", "9th", "N/A", "8th",…
$ result                <chr> "IN", "IN", "IN", "IN", "IN", "IN", "IN", "IN", …
$ uk_airdate            <chr> "17 August 2010", "17 August 2010", "17 August 2…
$ us_season             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ us_airdate            <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ showstopper_chocolate <chr> "chocolate", "chocolate", "no chocolate", "no ch…
$ showstopper_dessert   <chr> "other", "other", "other", "other", "other", "ca…
$ showstopper_fruit     <chr> "no fruit", "no fruit", "no fruit", "no fruit", …
$ showstopper_nut       <chr> "no nut", "no nut", "no nut", "no nut", "almond"…
$ signature_chocolate   <chr> "no chocolate", "chocolate", "no chocolate", "no…
$ signature_dessert     <chr> "cake", "cake", "cake", "cake", "cake", "cake", …
$ signature_fruit       <chr> "no fruit", "fruit", "fruit", "fruit", "fruit", …
$ signature_nut         <chr> "no nut", "no nut", "no nut", "no nut", "no nut"…

b) Are there any issues with the data import? If so, what are they?

Click for answer

Answer: Based on the output of glimpse, we can see that the ‘technical’ column should be a numeric column and the ‘uk_airdate’ column should be a date column. We can also identify any issues with missing values.

# your r-code

desserts <- read_csv(url,
  col_types = list(
    technical = col_number(),
    uk_airdate = col_date()
  )
)

problems(desserts)
# A tibble: 556 × 5
     row   col expected        actual         file 
   <int> <int> <chr>           <chr>          <chr>
 1     2     6 date in ISO8601 17 August 2010 ""   
 2     3     6 date in ISO8601 17 August 2010 ""   
 3     4     6 date in ISO8601 17 August 2010 ""   
 4     5     4 a number        N/A            ""   
 5     5     6 date in ISO8601 17 August 2010 ""   
 6     6     6 date in ISO8601 17 August 2010 ""   
 7     7     4 a number        N/A            ""   
 8     7     6 date in ISO8601 17 August 2010 ""   
 9     8     6 date in ISO8601 17 August 2010 ""   
10     9     4 a number        N/A            ""   
# ℹ 546 more rows

c) Import the dataset with correct data types, if needed. Fix the problems, if any.

Click for answer
desserts <- read_csv(url,
  col_types = list(
    technical = col_number(),
    uk_airdate = col_date()
  )
)

problems(desserts)
# A tibble: 556 × 5
     row   col expected        actual         file 
   <int> <int> <chr>           <chr>          <chr>
 1     2     6 date in ISO8601 17 August 2010 ""   
 2     3     6 date in ISO8601 17 August 2010 ""   
 3     4     6 date in ISO8601 17 August 2010 ""   
 4     5     4 a number        N/A            ""   
 5     5     6 date in ISO8601 17 August 2010 ""   
 6     6     6 date in ISO8601 17 August 2010 ""   
 7     7     4 a number        N/A            ""   
 8     7     6 date in ISO8601 17 August 2010 ""   
 9     8     6 date in ISO8601 17 August 2010 ""   
10     9     4 a number        N/A            ""   
# ℹ 546 more rows
desserts <- read_csv(url,
    col_types = list(
    technical = col_number(), 
    uk_airdate = col_date(format = "%d %B %Y")
  ) 
)

problems(desserts)
# A tibble: 7 × 5
    row   col expected actual file 
  <int> <int> <chr>    <chr>  <chr>
1     5     4 a number N/A    ""   
2     7     4 a number N/A    ""   
3     9     4 a number N/A    ""   
4    11     4 a number N/A    ""   
5    35     4 a number N/A    ""   
6    36     4 a number N/A    ""   
7    37     4 a number N/A    ""   
desserts <- read_csv(url,
  col_types = list(
    technical = col_number(), 
    uk_airdate = col_date(format = "%d %B %Y")
  ),
  na = c("", "NA", "N/A")
)

problems(desserts)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>

Your Turn 2

Use the appropriate read_<type>() function to import the datasets provided in the course webpage. Identify and fix any issues you encounter.

a) Importing simple data:

Click for answer
simple1 <- readr::read_csv("data/simple-1.dat")
problems(simple1)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
simple1
# A tibble: 3 × 4
  initials state   age time  
  <chr>    <chr> <dbl> <time>
1 vib      MA       61 06:01 
2 adc      TX       45 05:45 
3 kme      CT       50 04:19 

b) Importing mildly tricky data:

Click for answer
mild1 <- readr::read_delim("data/mild-1.csv", delim = "|")
problems(mild1)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
mild1
# A tibble: 10 × 5
   first   last      address                   city         postcode
   <chr>   <chr>     <chr>                     <chr>        <chr>   
 1 Leah    Downs     688-5741 Ut St.           Owensboro    V9Z 9K2 
 2 Boris   Kirby     257-5422 Vel Avenue       Rialto       C6I 9S0 
 3 Naida   Franco    809-5528 Tristique Avenue Atwater      T8K 7U8 
 4 Xena    Tucker    7218 A St.                Grand Forks  M6O 1X4 
 5 Rylee   Wise      155-6070 Purus. St.       Bradford     65359   
 6 Baxter  Gallagher 2415 Ligula. St.          Carbondale   55211   
 7 Griffin Benjamin  3261 Ac St.               Guayama      94450   
 8 Rinah   Bradley   787-9626 Eget Avenue      Norton       17673   
 9 Tobias  Walter    4717 Mauris. Street       Attleboro    73678   
10 Boris   Farley    893-8193 Quisque Avenue   San Clemente 74492   

c) Importing tricky data 1:

Click for answer
tricky1 <-  read_csv("data/tricky-1.csv")
problems(tricky1)
# A tibble: 2 × 5
    row   col expected  actual    file                                          
  <int> <int> <chr>     <chr>     <chr>                                         
1     4     4 5 columns 4 columns /home/deepak/Desktop/website/stat220-spring24…
2     7     4 5 columns 4 columns /home/deepak/Desktop/website/stat220-spring24…
# Fix missing values
tricky1[3, ] <- c(tricky1[3, 1:2], NA, tricky1[3, 3:4])
tricky1[6, ] <- c(tricky1[4, 1], NA, tricky1[4, 3:5])
tricky1
# A tibble: 10 × 5
   first   last     address                 city         postcode
   <chr>   <chr>    <chr>                   <chr>        <chr>   
 1 Leah    Downs    688-5741 Ut St.         Owensboro    V9Z 9K2 
 2 Boris   Kirby    257-5422 Vel Avenue     Rialto       C6I 9S0 
 3 Naida   Franco   <NA>                    Atwater      T8K 7U8 
 4 Xena    Tucker   7218 A St.              Grand Forks  M6O 1X4 
 5 Rylee   Wise     155-6070 Purus. St.     Bradford     65359   
 6 Xena    <NA>     7218 A St.              Grand Forks  M6O 1X4 
 7 Griffin Benjamin 3261 Ac St.             Guayama      94450   
 8 Rinah   Bradley  787-9626 Eget Avenue    Norton       17673   
 9 Tobias  Walter   4717 Mauris. Street     Attleboro    73678   
10 Boris   Farley   893-8193 Quisque Avenue San Clemente 74492   

d) Importing tricky data 2:

Click for answer
tricky2 <- read_csv("data/tricky-2.csv")
problems(tricky2)
# A tibble: 0 × 5
# ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
# Fix missing values
tricky2_part1 <- read_csv("data/tricky-2.csv", n_max = 7) %>%
  separate(city, c("city", "state"), sep = ", ") %>%
  select(-c(7))

tricky2_part2 <- read_csv(
  "data/tricky-2.csv",
  skip = 8,
  col_names = c("iata", "airport", "city", "state",  "latitude", "longitude")
)

# Combine parts
data_combined <- full_join(tricky2_part1, tricky2_part2)
data_combined
# A tibble: 16 × 6
   iata  airport                  city             state latitude longitude
   <chr> <chr>                    <chr>            <chr>    <dbl>     <dbl>
 1 00M   Thigpen                  Bay Springs      MS        32.0     -89.2
 2 00R   Livingston Municipal     Livingston       TX        30.7     -95.0
 3 00V   Meadow Lake              Colorado Springs CO        38.9    -105. 
 4 01G   Perry-Warsaw             Perry            NY        42.7     -78.1
 5 01J   Hilliard Airpark         Hilliard         FL        30.7     -81.9
 6 01M   Tishomingo County        Belmont          MS        34.5     -88.2
 7 02A   Gragg-Wade               Clanton          AL        32.9     -86.6
 8 02C   Capitol                  Brookfield       WI        43.1     -88.2
 9 02G   Columbiana County        East Liverpool   OH        40.7     -80.6
10 03D   Memphis Memorial         Memphis          MO        40.4     -92.2
11 04M   Calhoun County           Pittsboro        MS        33.9     -89.3
12 04Y   Hawley Municipal         Hawley           MN        46.9     -96.4
13 05C   Griffith-Merrillville    Griffith         IN        41.5     -87.4
14 05F   Gatesville - City/County Gatesville       TX        31.4     -97.8
15 05U   Eureka                   Eureka           NV        39.6    -116. 
16 07M   Riverside Municipal      Riverside        CA        34.0    -117. 

Acknowledgement

Parts of the activities are adapted from similar activity written by Adam Loy.