```r
# load the necessary libraries
library(tidyverse)
library(tidymodels)
library(yardstick)  # extra package for getting metrics
library(parsnip)    # tidy interface to models
library(ggthemes)
library(vip)
library(ISLR)
library(rpart.plot)
library(janitor)
library(ranger)

# read in the data, clean the column names, and prepare the columns
fire <- read_csv("https://raw.githubusercontent.com/deepbas/statdatasets/main/Algeriafires.csv")

fire <- fire %>%
  clean_names() %>%
  drop_na() %>%
  mutate_at(c(10, 13), as.numeric) %>%       # convert columns 10 and 13 to numeric
  mutate(classes = as.factor(classes)) %>%   # outcome as a factor
  select(-year, -day, -month)                # drop the date components
```
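Before modeling, it can help to confirm that the cleaned data look as expected and to check the balance of the outcome. The snippet below is an optional sanity check, not part of the original activity:

```r
# Optional sanity check: column types after cleaning
glimpse(fire)

# Class balance of the outcome; imbalance here is why the split and
# folds below stratify on `classes`
fire %>% count(classes)
```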
# Class Activity 27

## Group Activity 1
Use the `fire` data set and predict fire using all available predictor variables.

- Split the dataset into training and test sets in the proportion \(80\) to \(20\), create a 10-fold cross-validation object, and define a recipe to preprocess the data.
Answer:
```r
set.seed(314) # Remember to always set your seed.

# 80/20 train-test split, stratified on the outcome
fire_split <- initial_split(fire, prop = 0.80, strata = classes)
fire_train <- fire_split %>% training()
fire_test  <- fire_split %>% testing()

# Create folds for cross validation on the training data set
fire_folds <- vfold_cv(fire_train, v = 10, strata = classes)

# Recipe: model classes on all predictors, dummy-code nominal predictors
fire_recipe <- recipe(classes ~ ., data = fire_train) %>%
  step_dummy(all_nominal(), -all_outcomes())
```
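If you want to verify what the recipe produces (an optional check, not required by the activity), you can `prep()` it on the training data and `bake()` the result:

```r
# Optional check: apply the preprocessing to the training data and
# inspect the processed predictors (nominal columns become dummies)
fire_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
```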
- Specify a decision tree classification model with the `rpart` computational engine. Prepare the model for tuning (i.e., fitting with a range of parameters for validation purposes).
Answer:
```r
tree_model <- decision_tree(cost_complexity = tune(),
                            tree_depth = tune(),
                            min_n = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')
```
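As an optional check (not part of the activity), you can list the hyperparameters flagged with `tune()`; this sketch assumes the `extract_parameter_set_dials()` helper re-exported by tidymodels:

```r
# Optional check: which hyperparameters are marked for tuning
tree_model %>% extract_parameter_set_dials()
```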
- Combine the model and recipe into a workflow to easily manage the model-building process.
Answer:
```r
tree_workflow <- workflow() %>%
  add_model(tree_model) %>%
  add_recipe(fire_recipe)
```
- Create a grid of hyperparameter values to test.
Answer:
```r
tree_grid <- grid_random(cost_complexity(),
                         tree_depth(),
                         min_n(),
                         size = 10)
```
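`grid_random()` samples 10 random combinations from the parameter ranges. If you would rather search a small exhaustive grid, `grid_regular()` is a common alternative; the sketch below (with the illustrative name `tree_grid_reg`) is optional and not part of the original activity:

```r
# Optional alternative: a regular grid with 3 levels per parameter
# (3 x 3 x 3 = 27 candidate combinations)
tree_grid_reg <- grid_regular(cost_complexity(),
                              tree_depth(),
                              min_n(),
                              levels = 3)
```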
- Tune the decision tree workflow.
Answer:
```r
set.seed(314)

tree_tuning <- tree_workflow %>%
  tune_grid(resamples = fire_folds,
            grid = tree_grid)
```
- Show the best models under the accuracy criterion.
Answer:
```r
tree_tuning %>% show_best(metric = 'accuracy')
```

```
# A tibble: 5 × 9
  cost_complexity tree_depth min_n .metric  .estimator  mean     n std_err
            <dbl>      <int> <int> <chr>    <chr>      <dbl> <int>   <dbl>
1        6.85e- 8          9     2 accuracy binary     0.974    10  0.0118
2        1.37e-10          6     3 accuracy binary     0.968    10  0.0143
3        5.22e- 3          3    18 accuracy binary     0.963    10  0.0161
4        1.03e- 4         11    26 accuracy binary     0.963    10  0.0161
5        5.77e- 3          6    33 accuracy binary     0.963    10  0.0161
# ℹ 1 more variable: .config <chr>
```
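`show_best()` prints only the top five candidates. To inspect all tuning results (an optional step, not in the original activity), you can collect the metrics or plot them:

```r
# All resampled performance estimates, one row per candidate and metric
tree_tuning %>% collect_metrics()

# Visual summary of performance across the tried hyperparameter values
autoplot(tree_tuning)
```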
- Select the best model based on accuracy and view the best parameters. What is the corresponding tree depth?
Answer:
```r
best_tree <- tree_tuning %>% select_best(metric = 'accuracy')
best_tree
```

```
# A tibble: 1 × 4
  cost_complexity tree_depth min_n .config              
            <dbl>      <int> <int> <chr>                
1    0.0000000685          9     2 Preprocessor1_Model04
```

The selected model has a tree depth of 9.
- Using the `best_tree` object, finalize the workflow with `finalize_workflow()`.
Answer:
```r
final_tree_workflow <- tree_workflow %>% finalize_workflow(best_tree)
```
- Fit the finalized workflow to the training data and extract the fit.
Answer:
```r
tree_wf_fit <- final_tree_workflow %>% fit(data = fire_train)
tree_fit    <- tree_wf_fit %>% extract_fit_parsnip()
```
- Construct a variable importance plot. What can you conclude from this plot?
Answer:
```r
vip(tree_fit)

# Bars with different colors
vi_data <- vi(tree_fit)
vi_data$Variable <- fct_reorder(vi_data$Variable, vi_data$Importance)

ggplot(vi_data, aes(x = Variable, y = Importance, fill = Variable)) +
  geom_col() +
  scale_fill_viridis_d(option = "rocket", direction = -1) +
  labs(title = "Variable Importance", x = "Variables", y = "Importance") +
  theme_minimal() +
  coord_flip()
```
- Plot the fitted decision tree. What do you see in this plot?
Answer:
```r
rpart.plot(tree_fit$fit, roundint = FALSE)
```
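Note that `fire_test` has not been used so far. As an optional final step (not part of the original activity), you could score the finalized workflow on the held-out test set with `last_fit()`; `tree_last_fit` below is an illustrative name:

```r
# Refit on the training data and evaluate once on the test set
tree_last_fit <- final_tree_workflow %>%
  last_fit(split = fire_split)

# Test-set accuracy and ROC AUC
tree_last_fit %>% collect_metrics()
```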
## Group Activity 2
Use the `fire` dataset again to fit a random forest algorithm and produce an optimal set of variables for predicting fire. Use the same recipe defined earlier in Group Activity 1.
- Specify a random forest classification model with the `ranger` computational engine and `impurity` for variable importance. Prepare the model for tuning (i.e., fitting with a range of parameters for validation purposes).
Answer:
```r
rf_model <- rand_forest(mtry = tune(),
                        trees = tune(),
                        min_n = tune()) %>%
  set_engine('ranger', importance = "impurity") %>%
  set_mode('classification')
```
- Define a workflow object.
Answer:
```r
rf_workflow <- workflow() %>%
  add_model(rf_model) %>%
  add_recipe(fire_recipe)
```
- Create a grid of hyperparameter values to test. Try different values.
Answer:
```r
rf_grid <- grid_random(mtry() %>% range_set(c(1, 8)),
                       trees(),
                       min_n(),
                       size = 10)
```
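The range of `mtry()` is set manually here because its upper bound depends on the number of predictors and is unknown until the data are seen. An optional alternative (not in the original activity) is to let `dials::finalize()` derive the bound from the training predictors; `mtry_final` is an illustrative name:

```r
# Optional alternative: derive the upper bound of mtry from the data
mtry_final <- finalize(mtry(), fire_train %>% select(-classes))
mtry_final
```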
- Tune the random forest workflow. Use the `fire_folds` object from before for the 10-fold cross-validation routine.
Answer:
```r
rf_tuning <- rf_workflow %>%
  tune_grid(resamples = fire_folds,
            grid = rf_grid)
```
- Select the best model based on accuracy.
Answer:
```r
best_rf <- rf_tuning %>%
  select_best(metric = 'accuracy')
```
- Finalize the workflow, fit the model, and extract the fit.
Answer:
```r
final_rf_workflow <- rf_workflow %>%
  finalize_workflow(best_rf)

rf_wf_fit <- final_rf_workflow %>%
  fit(data = fire_train)

rf_fit <- rf_wf_fit %>%
  extract_fit_parsnip()
```
- Plot the variable importance. What can you conclude from this plot?
Answer:
```r
vip(rf_fit)
```
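As with the decision tree, an optional last step (not part of the original activity) is to evaluate the tuned random forest on the held-out test set; `rf_last_fit` below is an illustrative name:

```r
# Refit the finalized random forest and score it once on the test set
rf_last_fit <- final_rf_workflow %>%
  last_fit(split = fire_split)

rf_last_fit %>% collect_metrics()

# Confusion matrix of the test-set predictions
rf_last_fit %>%
  collect_predictions() %>%
  conf_mat(truth = classes, estimate = .pred_class)
```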