Class Activity 27

# load the necessary libraries
library(tidyverse) 
library(tidymodels)
library(yardstick) # extra package for getting metrics
library(parsnip) # tidy interface to models
library(ggthemes)
library(vip)
library(ISLR)
library(rpart.plot)
library(janitor)
library(ranger)

# Download the Algerian forest fires data and prepare it for modelling:
# standardise the column names, drop incomplete rows, coerce two columns to
# numeric, turn the outcome into a factor, and remove the date columns.
fire <- read_csv("https://raw.githubusercontent.com/deepbas/statdatasets/main/Algeriafires.csv") %>%
  clean_names() %>%
  drop_na() %>%
  # Columns are selected by position (10th and 13th); `across()` replaces the
  # superseded `mutate_at()`.
  # NOTE(review): a value that fails to parse becomes NA *after* drop_na()
  # has already run -- confirm whether drop_na() should follow this step.
  mutate(across(c(10, 13), as.numeric)) %>%
  mutate(classes = as.factor(classes)) %>%
  select(-year, -day, -month)

Group Activity 1

Use the fire data set and predict fire using all available predictor variables.

  1. Split the dataset into training and test sets in the proportion \(80\) to \(20\), create a 10-fold cross-validation object, and create a recipe to preprocess the data.
Click for answer

Answer:

# Fix the RNG seed so that the split and the folds are reproducible.
set.seed(314)

# Stratified 80/20 train/test split on the outcome.
fire_split <- initial_split(fire, prop = 0.80, strata = classes)
fire_train <- training(fire_split)
fire_test <- testing(fire_split)

# 10-fold cross-validation resamples, drawn from the training set only.
fire_folds <- vfold_cv(fire_train, v = 10, strata = classes)

# Preprocessing: dummy-encode the nominal predictors (the outcome is untouched).
fire_recipe <- recipe(classes ~ ., data = fire_train) %>%
  step_dummy(all_nominal_predictors())
  1. Specify a decision tree classification model with rpart computational engine. Prepare the model for tuning (i.e., fitting with a range of parameters for validation purposes).
Click for answer

Answer:

# Decision tree classifier on the rpart engine. All three hyper-parameters
# are marked with tune() so their values are supplied during tuning.
tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("classification")
  1. Combine the model and recipe into a workflow to easily manage the model-building process.
Click for answer

Answer:

# Bundle the preprocessing recipe with the tunable tree specification.
tree_workflow <- workflow() %>%
  add_recipe(fire_recipe) %>%
  add_model(tree_model)
  1. Create a grid of hyper-parameter values to test
Click for answer

Answer:

# Draw 10 random candidate combinations of the three tree hyper-parameters.
tree_grid <- grid_random(
  cost_complexity(),
  tree_depth(),
  min_n(),
  size = 10
)
  1. Tune decision tree workflow
Click for answer

Answer:

# Evaluate every grid candidate on the cross-validation folds.
set.seed(314)
tree_tuning <- tune_grid(
  tree_workflow,
  resamples = fire_folds,
  grid = tree_grid
)
  1. Show the best models under the accuracy criterion.
Click for answer

Answer:

# Rank the tuned candidates by cross-validated accuracy.
show_best(tree_tuning, metric = "accuracy")
# A tibble: 5 × 9
  cost_complexity tree_depth min_n .metric  .estimator  mean     n std_err
            <dbl>      <int> <int> <chr>    <chr>      <dbl> <int>   <dbl>
1        6.85e- 8          9     2 accuracy binary     0.974    10  0.0118
2        1.37e-10          6     3 accuracy binary     0.968    10  0.0143
3        5.22e- 3          3    18 accuracy binary     0.963    10  0.0161
4        1.03e- 4         11    26 accuracy binary     0.963    10  0.0161
5        5.77e- 3          6    33 accuracy binary     0.963    10  0.0161
# ℹ 1 more variable: .config <chr>
  1. Select best model based on accuracy and view the best parameters. What is the corresponding tree depth?
Click for answer

Answer:

# Keep the single hyper-parameter combination with the highest accuracy.
best_tree <- select_best(tree_tuning, metric = "accuracy")
best_tree
# A tibble: 1 × 4
  cost_complexity tree_depth min_n .config              
            <dbl>      <int> <int> <chr>                
1    0.0000000685          9     2 Preprocessor1_Model04
  1. Using the best_tree object, finalize the workflow using finalize_workflow().
Click for answer

Answer:

# Substitute the selected hyper-parameters for the tune() placeholders.
final_tree_workflow <- finalize_workflow(tree_workflow, best_tree)
  1. Fit the finalized workflow to the training data and extract the fit.
Click for answer

Answer:

# Fit the finalized workflow on the training set, then pull out the
# underlying parsnip model fit.
tree_wf_fit <- fit(final_tree_workflow, data = fire_train)
tree_fit <- extract_fit_parsnip(tree_wf_fit)
  1. Construct variable importance plot. What can you conclude from this plot?
Click for answer

Answer:

# Default variable importance plot.
vip(tree_fit)

# Bars with different colors: order the variables by importance so the bars
# are sorted, and map the fill colour to the variable.
vi_data <- vi(tree_fit) %>%
  mutate(Variable = fct_reorder(Variable, Importance))

ggplot(vi_data, aes(x = Variable, y = Importance, fill = Variable)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_d(option = "rocket", direction = -1) +
  labs(title = "Variable Importance", x = "Variables", y = "Importance") +
  theme_minimal()

  1. Construct a decision tree. What do you see in this plot?
Click for answer

Answer:

# Draw the fitted tree; the engine-level fit object lives in the `fit` element.
rpart.plot(tree_fit[["fit"]], roundint = FALSE)


Group Activity 2

Use the fire dataset again to fit a random forest algorithm and produce an optimal set of variables for predicting fire. Use the same recipe defined earlier in Group Activity 1.

  1. Specify a random forest classification model with the ranger computational engine, using impurity for variable importance. Prepare the model for tuning (i.e., fitting with a range of parameters for validation purposes).
Click for answer

Answer:

# Random forest classifier on the ranger engine. Impurity-based importance
# is requested from the engine, and all three hyper-parameters are marked
# with tune().
rf_model <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()
) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")
  1. Define a workflow object.
Click for answer

Answer:

# Bundle the shared recipe with the tunable random forest specification.
rf_workflow <- workflow() %>%
  add_recipe(fire_recipe) %>%
  add_model(rf_model)
  1. Create a grid of hyper parameter values to test. Try different values.
Click for answer

Answer:

# Draw 10 random candidate combinations. mtry has no default upper bound,
# so its range is set explicitly to 1-8.
rf_grid <- grid_random(
  range_set(mtry(), c(1, 8)),
  trees(),
  min_n(),
  size = 10
)
  1. Tune the random forest workflow. Use the fire_folds object from before, with its 10-fold cross-validation routine.
Click for answer

Answer:

# Evaluate every grid candidate on the cross-validation folds.
# Random forest fitting is stochastic, so the seed is fixed right before
# tuning to keep the results reproducible when this chunk is run on its own.
set.seed(314)
rf_tuning <- rf_workflow %>%
  tune_grid(
    resamples = fire_folds,
    grid = rf_grid
  )
  1. Select the best model based on accuracy.
Click for answer

Answer:

# Keep the single hyper-parameter combination with the highest accuracy.
best_rf <- select_best(rf_tuning, metric = "accuracy")
  1. Finalize the workflow, fit the model, and extract the fit.
Click for answer

Answer:

# Plug the selected hyper-parameters into the workflow, fit it on the
# training set, and pull out the underlying parsnip model fit.
final_rf_workflow <- finalize_workflow(rf_workflow, best_rf)
rf_wf_fit <- fit(final_rf_workflow, data = fire_train)
rf_fit <- extract_fit_parsnip(rf_wf_fit)
  1. Plot the variable importance. What can you conclude from this plot?
Click for answer

Answer:

# Variable importance plot for the fitted random forest.
rf_fit %>% vip()