# load the necessary libraries
library(tidyverse) 
library(tidymodels)
library(yardstick) # extra package for getting metrics
library(parsnip) # tidy interface to models
library(ggthemes)
library(vip)
library(ISLR)
library(rpart.plot)
library(janitor)
library(ranger)
# Read the fire data from the course repository, standardise the column
# names, drop incomplete rows, coerce the two positionally selected columns
# (10 and 13) to numeric, turn the outcome into a factor, and remove the
# date columns.
fire <- read_csv("https://raw.githubusercontent.com/deepbas/statdatasets/main/Algeriafires.csv")
fire <- fire %>%
  clean_names() %>%
  drop_na() %>%
  # across() replaces the superseded mutate_at(); the positions are unchanged.
  mutate(across(c(10, 13), as.numeric)) %>%
  mutate(classes = as.factor(classes)) %>%
  select(-year, -day, -month)
Class Activity 27
Group Activity 1
Use the fire data set and predict fire using all available predictor variables.
- Split the dataset into training and test sets by the proportion \(80\) to \(20\), create a 10-fold cross-validation object, and create a recipe to preprocess the data.
Click for answer
Answer:
set.seed(314) # Fix the RNG state so the split and the folds are reproducible.
# Stratified 80/20 split on the outcome.
fire_split <- initial_split(fire, prop = 0.80, strata = classes)
fire_train <- training(fire_split)
fire_test <- testing(fire_split)
# Ten stratified cross-validation folds built from the training portion only.
fire_folds <- vfold_cv(fire_train, v = 10, strata = classes)
# Preprocessing recipe: dummy-encode any nominal predictors.
fire_recipe <- recipe(classes ~ ., data = fire_train) %>%
  step_dummy(all_nominal_predictors())
- Specify a decision tree classification model with rpart computational engine. Prepare the model for tuning (i.e., fitting with a range of parameters for validation purposes).
Click for answer
Answer:
# Decision tree classifier on the rpart engine. All three hyperparameters are
# left as tune() placeholders so they can be filled in during tuning.
tree_model <- decision_tree(
  mode = "classification",
  engine = "rpart",
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
)
- Combine the model and recipe into a workflow to easily manage the model-building process.
Click for answer
Answer:
# Bundle the preprocessing recipe and the model specification together.
tree_workflow <- workflow(preprocessor = fire_recipe, spec = tree_model)
- Create a grid of hyper-parameter values to test
Click for answer
Answer:
# Draw 10 random combinations of the three tree hyperparameters.
tree_grid <- grid_random(
  cost_complexity(), tree_depth(), min_n(),
  size = 10
)
- Tune decision tree workflow
Click for answer
Answer:
set.seed(314)
# Evaluate every candidate in tree_grid on the cross-validation folds.
tree_tuning <- tune_grid(
  tree_workflow,
  resamples = fire_folds,
  grid = tree_grid
)
- Show the best models under the accuracy criteria.
Click for answer
Answer:
# List the top candidates ranked by accuracy.
show_best(tree_tuning, metric = "accuracy")
# A tibble: 5 × 9
  cost_complexity tree_depth min_n .metric  .estimator  mean     n std_err
            <dbl>      <int> <int> <chr>    <chr>      <dbl> <int>   <dbl>
1        6.85e- 8          9     2 accuracy binary     0.974    10  0.0118
2        1.37e-10          6     3 accuracy binary     0.968    10  0.0143
3        5.22e- 3          3    18 accuracy binary     0.963    10  0.0161
4        1.03e- 4         11    26 accuracy binary     0.963    10  0.0161
5        5.77e- 3          6    33 accuracy binary     0.963    10  0.0161
# ℹ 1 more variable: .config <chr>
- Select the best model based on accuracy and view the best parameters. What is the corresponding tree depth?
Click for answer
Answer:
# Keep the hyperparameter combination that ranks best on accuracy.
best_tree <- select_best(tree_tuning, metric = "accuracy")
best_tree
# A tibble: 1 × 4
  cost_complexity tree_depth min_n .config              
            <dbl>      <int> <int> <chr>                
1    0.0000000685          9     2 Preprocessor1_Model04
- Using the best_tree object, finalize the workflow using finalize_workflow().
Click for answer
Answer:
# Substitute the selected hyperparameter values for the tune() placeholders.
final_tree_workflow <- finalize_workflow(tree_workflow, best_tree)
- Fit the train data to the finalized workflow and extract the fit.
Click for answer
Answer:
# Fit the finalized workflow on the training set, then pull out the
# underlying parsnip model fit. These are two separate statements; running
# them fused on one line is a syntax error.
tree_wf_fit <- final_tree_workflow %>% fit(data = fire_train)
tree_fit <- tree_wf_fit %>% extract_fit_parsnip()
- Construct variable importance plot. What can you conclude from this plot?
Click for answer
Answer:
# Default variable-importance chart for the fitted tree.
vip(tree_fit)
# Bars with different colors
# Order the variables by importance so the bars are drawn in sorted order.
vi_data <- vi(tree_fit) %>%
  mutate(Variable = fct_reorder(Variable, Importance))
ggplot(vi_data, aes(x = Variable, y = Importance, fill = Variable)) +
  geom_col() +
  scale_fill_viridis_d(option = "rocket", direction = -1) +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance") +
  theme_minimal()
- Construct a decision tree. What do you see in this plot?
Click for answer
Answer:
# Draw the fitted tree from the engine-level object stored in the parsnip fit.
rpart.plot(tree_fit[["fit"]], roundint = FALSE)
Group Activity 2
Use the fire dataset again to fit a random forest algorithm and identify the optimal set of variables for predicting fire. Use the same recipe defined earlier in Group Activity 1.
- Specify a random forest classification model with the ranger computational engine and impurity for variable importance. Prepare the model for tuning (i.e., fitting with a range of parameters for validation purposes).
Click for answer
Answer:
# Random forest classifier with all three hyperparameters left as tune()
# placeholders. The ranger engine is asked to record impurity-based
# variable importance.
rf_model <- rand_forest(
  mode = "classification",
  mtry = tune(),
  trees = tune(),
  min_n = tune()
) %>%
  set_engine("ranger", importance = "impurity")
- Define a workflow object.
Click for answer
Answer:
# Pair the shared preprocessing recipe with the random forest specification.
rf_workflow <- workflow(preprocessor = fire_recipe, spec = rf_model)
- Create a grid of hyper parameter values to test. Try different values.
Click for answer
Answer:
# Draw 10 random hyperparameter combinations; mtry is restricted to 1-8.
rf_grid <- grid_random(
  mtry(range = c(1, 8)),
  trees(),
  min_n(),
  size = 10
)
- Tune the random forest workflow. Use the fire_folds object from before with 10 cross validation routine.
Click for answer
Answer:
# Random forest fitting is stochastic, so seed the RNG before tuning (as is
# done for the decision tree tuning) to make the results reproducible.
set.seed(314)
# Evaluate every candidate in rf_grid on the cross-validation folds.
rf_tuning <- rf_workflow %>%
  tune_grid(
    resamples = fire_folds,
    grid = rf_grid
  )
- Select the best model based on accuracy.
Click for answer
Answer:
# Keep the hyperparameter combination that ranks best on accuracy.
best_rf <- select_best(rf_tuning, metric = "accuracy")
- Finalize the workflow, fit the model, and extract the parameters.
Click for answer
Answer:
# Substitute the selected hyperparameter values for the tune() placeholders.
final_rf_workflow <- rf_workflow %>%
  finalize_workflow(best_rf)
# Seed the RNG so the final forest, and therefore its importance scores,
# can be reproduced.
set.seed(314)
rf_wf_fit <- final_rf_workflow %>%
  fit(data = fire_train)
# Pull the parsnip model fit out of the fitted workflow.
rf_fit <- rf_wf_fit %>%
  extract_fit_parsnip()
- Plot the variable importance. What can you conclude from this plot?
Click for answer
Answer:
# Variable-importance chart for the fitted random forest.
rf_fit %>% vip()