```r
# load the necessary libraries
library(tidyverse)
library(stringr)
library(purrr)
library(ggthemes)
library(rvest)
library(polite)
```
# Class Activity 17

## Group Activity 1
- Go to the [The Numbers](https://www.the-numbers.com/movie/budgets/all) webpage and extract the table on the front page.
<- read_html("https://www.the-numbers.com/movie/budgets/all") %>%
session1 html_nodes(css = "table") %>%
html_table()
<- session1 %>% .[[1]] table_base
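Since `polite` is loaded above, the same table can also be pulled through a polite session, which checks the site's `robots.txt` and rate-limits requests. A minimal sketch, assuming the site permits scraping:

```r
# Bow to the host first, then scrape through the polite session
session_polite <- bow("https://www.the-numbers.com/movie/budgets/all")
table_polite <- scrape(session_polite) %>%
  html_nodes(css = "table") %>%
  html_table() %>%
  .[[1]]
```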
- Find out the number of pages that contain the movie table by watching how the URL in the address bar changes. How does the URL change when you go to the next page?
Answer: The starting rank of the movies on each page is appended to the URL, increasing in increments of 100.
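To see the pattern concretely, the first few page URLs can be generated with `str_glue()`, which vectorizes over the sequence (a quick illustration; `base_url` is just a local name for the address above):

```r
# Pages start at ranks 1, 101, 201, ...
base_url <- "https://www.the-numbers.com/movie/budgets/all/"
str_glue("{base_url}{seq(1, 301, 100)}")
#> https://www.the-numbers.com/movie/budgets/all/1
#> https://www.the-numbers.com/movie/budgets/all/101
#> https://www.the-numbers.com/movie/budgets/all/201
#> https://www.the-numbers.com/movie/budgets/all/301
```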
- Write a for loop to store the data from all the pages in a single data frame. Please do the same using `purrr::map_df()` as well.
```r
library(tidyverse)
library(rvest)

new_urls <- "https://www.the-numbers.com/movie/budgets/all/"

# Create an empty list to hold one data frame per page
df1 <- list()

# Generate a vector of starting ranks, one per page
index <- seq(1, 6301, 100)

# Loop through indices, scrape each page, and store the resulting data frame
start_time <- proc.time() # Capture start time
for (i in 1:length(index)) {
  url <- str_glue("{new_urls}{index[i]}")
  webpage <- read_html(url)
  table_new <- html_table(webpage)[[1]] %>%
    janitor::clean_names() %>%
    mutate(across(everything(), as.character))
  df1[[i]] <- table_new
}
end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration

# Two equivalent ways to bind the list of page tables into one data frame
df1_final <- do.call(rbind, df1)
df1_final1 <- reduce(df1, dplyr::bind_rows)
```
```r
# alternate using map_df()
start_time <- proc.time() # Capture start time

# Two equivalent ways to build the vector of page URLs
urls <- map(index, function(i) str_glue("{new_urls}{i}"))
urls <- map(index, ~ str_glue("{new_urls}{.x}"))
```
```r
library(tidyverse)
library(rvest)
library(glue)
library(janitor)

# Assuming 'urls' is already defined
movies_data <- map_df(urls, ~ read_html(.x) %>%
  html_table() %>%
  .[[1]] %>%
  janitor::clean_names() %>%
  mutate(across(everything(), as.character)))

end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration
```
```
   user  system elapsed 
  4.111   0.181  69.699 
```
```r
movies_data %>% slice_head(n = 6)
```
```
# A tibble: 6 × 6
  x     release_date movie      production_budget domestic_gross worldwide_gross
  <chr> <chr>        <chr>      <chr>             <chr>          <chr>
1 1     Dec 9, 2022  Avatar: T… $460,000,000      $684,075,767   $2,317,514,386
2 2     Jun 28, 2023 Indiana J… $402,300,000      $174,480,468   $383,963,057
3 3     Apr 23, 2019 Avengers:… $400,000,000      $858,373,000   $2,788,912,285
4 4     May 20, 2011 Pirates o… $379,000,000      $241,071,802   $1,045,713,802
5 5     Apr 22, 2015 Avengers:… $365,000,000      $459,005,868   $1,395,316,979
6 6     May 17, 2023 Fast X     $340,000,000      $146,126,015   $714,567,285
```
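Because every column was coerced to character before binding, the dollar amounts still contain `$` and commas. One way to finish the cleanup is `readr::parse_number()`; this sketch assumes the column names produced by `clean_names()` above:

```r
# Strip "$" and "," and convert the currency columns to numeric
movies_clean <- movies_data %>%
  mutate(across(c(production_budget, domestic_gross, worldwide_gross),
                readr::parse_number))
```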
## Group Activity 2
- Go to [Scrape This Site](https://www.scrapethissite.com/pages/forms/) and extract the table on the front page.
<- read_html("https://www.scrapethissite.com/pages/forms/") %>%
session1 html_nodes(css = "table") %>%
html_table()
<- session1 %>% .[[1]] table_base
- Find out the number of pages that contain the hockey team table by watching how the URL in the address bar changes. How does the URL change when you go to the next page?
Answer: The URL gains a `?page_num=` query parameter, with the page number running from 1 to 24.
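The page count can also be confirmed programmatically by reading the pagination links at the bottom of the page. A sketch, assuming the pager uses the Bootstrap `.pagination` class (the non-numeric "next" arrow parses to `NA` and is dropped):

```r
# Largest page number that appears in the pagination links
last_page <- read_html("https://www.scrapethissite.com/pages/forms/") %>%
  html_nodes(".pagination a") %>%
  html_text2() %>%
  readr::parse_number() %>%
  max(na.rm = TRUE)
```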
- Write a for loop to store the data from all the pages in a single data frame. Please do the same using `purrr::map_df()` as well.
```r
library(tidyverse)
library(rvest)

new_urls <- "http://scrapethissite.com/pages/forms/?page_num="

# Generate a vector of page numbers
index <- seq(1, 24)

# Create an empty list to hold one data frame per page
df2 <- list()

start_time <- proc.time() # Capture start time
for (i in index) {
  url <- str_glue("{new_urls}{i}")
  webpage <- read_html(url)
  table_new <- html_table(webpage)[[1]] %>%
    janitor::clean_names() %>%
    # set_names(~ ifelse(is.na(.) | . == "", paste("V", seq_along(.), sep = ""), .)) %>%
    mutate(across(everything(), as.character))
  df2[[i]] <- table_new
}
end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration
```
```
   user  system elapsed 
  1.529   0.100   8.880 
```
```r
df2_final <- bind_rows(df2)
df2_final
```
```
# A tibble: 582 × 9
   team_name             year  wins  losses ot_losses win_percent goals_for_gf
   <chr>                 <chr> <chr> <chr>  <chr>     <chr>       <chr>
 1 Boston Bruins         1990  44    24     <NA>      0.55        299
 2 Buffalo Sabres        1990  31    30     <NA>      0.388       292
 3 Calgary Flames        1990  46    26     <NA>      0.575       344
 4 Chicago Blackhawks    1990  49    23     <NA>      0.613       284
 5 Detroit Red Wings     1990  34    38     <NA>      0.425       273
 6 Edmonton Oilers       1990  37    37     <NA>      0.463       272
 7 Hartford Whalers      1990  31    38     <NA>      0.388       238
 8 Los Angeles Kings     1990  46    24     <NA>      0.575       340
 9 Minnesota North Stars 1990  27    39     <NA>      0.338       256
10 Montreal Canadiens    1990  39    30     <NA>      0.487       273
# ℹ 572 more rows
# ℹ 2 more variables: goals_against_ga <chr>, x <chr>
```
```r
# alternate using map
# Two equivalent ways to build the vector of page URLs
urls <- map(index, function(i) str_glue("{new_urls}{i}"))
urls <- map(index, ~ str_glue("{new_urls}{.x}"))
```
```r
start_time <- proc.time() # Capture start time
sports_data <- map_df(urls, ~ read_html(.x) %>%
  html_table() %>%
  .[[1]] %>%
  janitor::clean_names() %>%
  mutate(across(everything(), as.character)))
end_time <- proc.time() # Capture end time
end_time - start_time # Calculate duration
```
```
   user  system elapsed 
  1.534   0.055   8.423 
```
```r
sports_data %>% slice_head(n = 7)
```
```
# A tibble: 7 × 9
  team_name          year  wins  losses ot_losses win_percent goals_for_gf
  <chr>              <chr> <chr> <chr>  <chr>     <chr>       <chr>
1 Boston Bruins      1990  44    24     <NA>      0.55        299
2 Buffalo Sabres     1990  31    30     <NA>      0.388       292
3 Calgary Flames     1990  46    26     <NA>      0.575       344
4 Chicago Blackhawks 1990  49    23     <NA>      0.613       284
5 Detroit Red Wings  1990  34    38     <NA>      0.425       273
6 Edmonton Oilers    1990  37    37     <NA>      0.463       272
7 Hartford Whalers   1990  31    38     <NA>      0.388       238
# ℹ 2 more variables: goals_against_ga <chr>, x <chr>
```
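Here too the columns are all character because of the coercion used for binding. `readr::type_convert()` can re-infer the proper types in one step (a sketch; it should turn the wins, losses, and goals columns into numerics):

```r
# Re-guess column types after the all-character coercion
sports_typed <- sports_data %>%
  readr::type_convert()
```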