```r
# load the necessary libraries
library(tidyverse)
library(stringr)
library(purrr)
library(ggthemes)
library(rvest)
library(polite)
```
# Class Activity 17

## Group Activity 1
- Go to the [The Numbers](https://www.the-numbers.com/movie/budgets/all) webpage and extract the table on the front page.
<- read_html("https://www.the-numbers.com/movie/budgets/all") %>%
session1 html_nodes(css = "table") %>%
html_table()
<- session1 %>% .[[1]] table_base
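Since `polite` is loaded above, the same table can also be pulled through a polite session, which checks the site's `robots.txt` and rate-limits requests. A minimal sketch, assuming the site permits scraping:

```r
# Bow to the host first, then scrape through the polite session
session_polite <- bow("https://www.the-numbers.com/movie/budgets/all")
table_polite <- scrape(session_polite) %>%
  html_nodes(css = "table") %>%
  html_table() %>%
  .[[1]]
```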
- Find out the number of pages that contain the movie table by watching how the URL in the address bar changes. How does the URL change when you go to the next page?
Answer: The starting rank of the movies on each page is appended to the URL, increasing in increments of 100.
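To see the pattern concretely, the first few page URLs can be generated with `str_glue()`, which vectorizes over the sequence (a quick illustration; `base_url` is just a local name for the address above):

```r
# Pages start at ranks 1, 101, 201, ...
base_url <- "https://www.the-numbers.com/movie/budgets/all/"
str_glue("{base_url}{seq(1, 301, 100)}")
#> https://www.the-numbers.com/movie/budgets/all/1
#> https://www.the-numbers.com/movie/budgets/all/101
#> https://www.the-numbers.com/movie/budgets/all/201
#> https://www.the-numbers.com/movie/budgets/all/301
```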
- Write a for loop to store the data from all the pages in a single data frame. Please do the same using `purrr::map_df()` as well.
```r
library(tidyverse)
library(rvest)

new_urls <- "https://www.the-numbers.com/movie/budgets/all/"

# Create an empty list to hold one data frame per page
df1 <- list()

# Generate a vector of starting ranks, one per page
index <- seq(1, 6301, 100)

# Loop through indices, scrape each page, and store the resulting data frame
start_time <- proc.time() # Capture start time
for (i in 1:length(index)) {
  url <- str_glue("{new_urls}{index[i]}")
  webpage <- read_html(url)
  table_new <- html_table(webpage)[[1]] %>%
    janitor::clean_names() %>%
    mutate(across(everything(), as.character))
  df1[[i]] <- table_new
}
end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration

# Two equivalent ways to bind the list of page tables into one data frame
df1_final <- do.call(rbind, df1)
df1_final1 <- reduce(df1, dplyr::bind_rows)
```
```r
# alternate using map_df()
start_time <- proc.time() # Capture start time

# Two equivalent ways to build the vector of page URLs
urls <- map(index, function(i) str_glue("{new_urls}{i}"))
urls <- map(index, ~ str_glue("{new_urls}{.x}"))
```
```r
library(tidyverse)
library(rvest)
library(glue)
library(janitor)

# Assuming 'urls' is already defined
movies_data <- map_df(urls, ~ read_html(.x) %>%
  html_table() %>%
  .[[1]] %>%
  janitor::clean_names() %>%
  mutate(across(everything(), as.character)))

end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration
```
```
   user  system elapsed 
  4.111   0.181  69.699 
```
```r
movies_data %>% slice_head(n = 6)
```
```
# A tibble: 6 × 6
  x     release_date movie      production_budget domestic_gross worldwide_gross
  <chr> <chr>        <chr>      <chr>             <chr>          <chr>
1 1     Dec 9, 2022  Avatar: T… $460,000,000      $684,075,767   $2,317,514,386
2 2     Jun 28, 2023 Indiana J… $402,300,000      $174,480,468   $383,963,057
3 3     Apr 23, 2019 Avengers:… $400,000,000      $858,373,000   $2,788,912,285
4 4     May 20, 2011 Pirates o… $379,000,000      $241,071,802   $1,045,713,802
5 5     Apr 22, 2015 Avengers:… $365,000,000      $459,005,868   $1,395,316,979
6 6     May 17, 2023 Fast X     $340,000,000      $146,126,015   $714,567,285
```
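Because every column was coerced to character before binding, the dollar amounts still contain `$` and commas. One way to finish the cleanup is `readr::parse_number()`; this sketch assumes the column names produced by `clean_names()` above:

```r
# Strip "$" and "," and convert the currency columns to numeric
movies_clean <- movies_data %>%
  mutate(across(c(production_budget, domestic_gross, worldwide_gross),
                readr::parse_number))
```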
## Group Activity 2
- Go to [Scrape This Site](https://www.scrapethissite.com/pages/forms/) and extract the table on the front page.
<- read_html("https://www.scrapethissite.com/pages/forms/") %>%
session1 html_nodes(css = "table") %>%
html_table()
<- session1 %>% .[[1]] table_base
- Find out the number of pages that contain the hockey team table by watching how the URL in the address bar changes. How does the URL change when you go to the next page?
Answer: The URL gains a `?page_num=` query parameter, with the page number running from 1 to 24.
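The page count can also be confirmed programmatically by reading the pagination links at the bottom of the page. A sketch, assuming the pager uses the Bootstrap `.pagination` class (the non-numeric "next" arrow parses to `NA` and is dropped):

```r
# Largest page number that appears in the pagination links
last_page <- read_html("https://www.scrapethissite.com/pages/forms/") %>%
  html_nodes(".pagination a") %>%
  html_text2() %>%
  readr::parse_number() %>%
  max(na.rm = TRUE)
```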
- Write a for loop to store the data from all the pages in a single data frame. Please do the same using `purrr::map_df()` as well.
```r
library(tidyverse)
library(rvest)

new_urls <- "http://scrapethissite.com/pages/forms/?page_num="

# Generate a vector of page numbers
index <- seq(1, 24)

# Create an empty list to hold one data frame per page
df2 <- list()

start_time <- proc.time() # Capture start time
for (i in index) {
  url <- str_glue("{new_urls}{i}")
  webpage <- read_html(url)
  table_new <- html_table(webpage)[[1]] %>%
    janitor::clean_names() %>%
    # set_names(~ ifelse(is.na(.) | . == "", paste("V", seq_along(.), sep = ""), .)) %>%
    mutate(across(everything(), as.character))
  df2[[i]] <- table_new
}
end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration
```
```
   user  system elapsed 
  1.529   0.100   8.880 
```
```r
df2_final <- bind_rows(df2)
df2_final
```
```
# A tibble: 582 × 9
   team_name             year  wins  losses ot_losses win_percent goals_for_gf
   <chr>                 <chr> <chr> <chr>  <chr>     <chr>       <chr>
 1 Boston Bruins         1990  44    24     <NA>      0.55        299
 2 Buffalo Sabres        1990  31    30     <NA>      0.388       292
 3 Calgary Flames        1990  46    26     <NA>      0.575       344
 4 Chicago Blackhawks    1990  49    23     <NA>      0.613       284
 5 Detroit Red Wings     1990  34    38     <NA>      0.425       273
 6 Edmonton Oilers       1990  37    37     <NA>      0.463       272
 7 Hartford Whalers      1990  31    38     <NA>      0.388       238
 8 Los Angeles Kings     1990  46    24     <NA>      0.575       340
 9 Minnesota North Stars 1990  27    39     <NA>      0.338       256
10 Montreal Canadiens    1990  39    30     <NA>      0.487       273
# ℹ 572 more rows
# ℹ 2 more variables: goals_against_ga <chr>, x <chr>
```
```r
# alternate using map
# Two equivalent ways to build the vector of page URLs
urls <- map(index, function(i) str_glue("{new_urls}{i}"))
urls <- map(index, ~ str_glue("{new_urls}{.x}"))
```
```r
start_time <- proc.time() # Capture start time
sports_data <- map_df(urls, ~ read_html(.x) %>%
  html_table() %>%
  .[[1]] %>%
  janitor::clean_names() %>%
  mutate(across(everything(), as.character)))
end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration
```
```
   user  system elapsed 
  1.534   0.055   8.423 
```
```r
sports_data %>% slice_head(n = 7)
```
```
# A tibble: 7 × 9
  team_name          year  wins  losses ot_losses win_percent goals_for_gf
  <chr>              <chr> <chr> <chr>  <chr>     <chr>       <chr>
1 Boston Bruins      1990  44    24     <NA>      0.55        299
2 Buffalo Sabres     1990  31    30     <NA>      0.388       292
3 Calgary Flames     1990  46    26     <NA>      0.575       344
4 Chicago Blackhawks 1990  49    23     <NA>      0.613       284
5 Detroit Red Wings  1990  34    38     <NA>      0.425       273
6 Edmonton Oilers    1990  37    37     <NA>      0.463       272
7 Hartford Whalers   1990  31    38     <NA>      0.388       238
# ℹ 2 more variables: goals_against_ga <chr>, x <chr>
```
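Here too the columns are all character because of the coercion used for binding. `readr::type_convert()` can re-infer the proper types in one step (a sketch; it should turn the wins, losses, and goals columns into numerics):

```r
# Re-guess column types after the all-character coercion
sports_typed <- sports_data %>%
  readr::type_convert()
```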