Class Activity 23

# load the necessary libraries
library(tidyverse) 
library(tidymodels)
library(mlbench)    # for PimaIndiansDiabetes2 dataset
library(janitor)
library(yardstick) 
library(parsnip) # tidy interface to models
library(ggthemes)
library(forcats)
library(probably)
library(yardstick)

Group Activity 1

Load the mlbench package to get the PimaIndiansDiabetes2 dataset.

# Load the data - diabetes
data(PimaIndiansDiabetes2)

# Keep complete cases only. drop_na() runs on the full data set, before
# select(), so a row is removed if ANY original column is missing -- not just
# the three columns kept below.
db <- PimaIndiansDiabetes2 %>% drop_na()
db_raw <- db %>% select(glucose, insulin, diabetes)

# Fix the RNG seed so the train/test split (and everything that is run after
# it in this script) gives the same result on every run.
set.seed(2056)

# 80/20 split, stratified on the outcome so both pieces keep roughly the same
# share of each diabetes class.
db_split <- initial_split(db_raw, prop = 0.80, strata = diabetes)
# Create training data
db_train <- db_split %>% training()
# Create testing data
db_test <- db_split %>% testing()

a. Creating the Recipe: Construct a recipe that predicts diabetes status from glucose and insulin using the training set, normalizing both predictors so that their scales are comparable.

Click for answer

Answer:

# Recipe: model diabetes from glucose and insulin, using the training set to
# estimate the preprocessing parameters. step_normalize() centers each
# predictor to mean 0 and scales it to standard deviation 1 in a single step,
# which matters for KNN because distances are otherwise dominated by the
# predictor with the larger raw scale.
db_recipe <- recipe(diabetes ~ glucose + insulin, data = db_train) %>%
  step_normalize(all_predictors())

b. Model Specification: Define the KNN model using a flexible tune() placeholder for the number of neighbors, specifying a classification task.

Click for answer

Answer:

# KNN classifier specification. `neighbors` is left as a tune() placeholder so
# that K is chosen later by resampling; "rectangular" weighting gives every
# neighbor an equal vote.
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

c. Creating Folds: Divide the training data into 10 folds, stratified on the diabetes outcome, to prepare for cross-validation. Stratifying ensures that each fold has roughly the same proportion of each diabetes class as the full training set.

Click for answer

Answer:

# 10-fold cross-validation resamples of the training data, stratified on the
# outcome column.
db_vfold <- db_train %>%
  vfold_cv(v = 10, strata = diabetes)

d. Cross-Validation Grid: Generate a sequence of K values to test with 10-fold cross-validation, evaluating model performance across a range of neighbors.

Click for answer

Answer:

# Candidate values of K to evaluate: 1, 2, ..., 40.
k_vals <- tibble(neighbors = seq(from = 1, to = 40, by = 1))

# Tune K: for every candidate in k_vals, fit the recipe + KNN workflow on each
# cross-validation fold and score it with the four metrics below.
# All metrics are namespaced explicitly; `spec` in particular also exists in
# readr, so the unqualified name depends on package load order.
# NOTE(review): yardstick treats the FIRST factor level as the event by
# default. If the levels of `diabetes` are ordered neg, pos, then sens, spec
# and ppv here describe the "neg" class -- confirm this is intended.
knn_fit <- workflow() %>%
  add_recipe(db_recipe) %>%
  add_model(knn_spec) %>%
  tune_grid(
    resamples = db_vfold,
    grid = k_vals,
    metrics = metric_set(
      yardstick::ppv,
      yardstick::accuracy,
      yardstick::sens,
      yardstick::spec
    ),
    # control_grid() is the control constructor that belongs to tune_grid();
    # save_pred = TRUE keeps the out-of-fold predictions.
    control = control_grid(save_pred = TRUE)
  )

# One row per (K, metric) with the mean and standard error across folds.
cv_metrics <- collect_metrics(knn_fit)

# Best K for each metric; ties are kept, so a metric can return several rows.
cv_metrics %>% group_by(.metric) %>% slice_max(mean)
# A tibble: 8 × 7
# Groups:   .metric [4]
  neighbors .metric  .estimator  mean     n std_err .config              
      <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
1        25 accuracy binary     0.751    10  0.0167 Preprocessor1_Model25
2        26 accuracy binary     0.751    10  0.0167 Preprocessor1_Model26
3         7 ppv      binary     0.791    10  0.0286 Preprocessor1_Model07
4         8 ppv      binary     0.791    10  0.0286 Preprocessor1_Model08
5        37 sens     binary     0.910    10  0.0240 Preprocessor1_Model37
6        38 sens     binary     0.910    10  0.0240 Preprocessor1_Model38
7         5 spec     binary     0.526    10  0.0582 Preprocessor1_Model05
8         6 spec     binary     0.526    10  0.0582 Preprocessor1_Model06

e. Visualization: Plot the cross-validation results to determine the optimal K value, comparing different performance metrics visually.

Click for answer

Answer:

# Keep only what the plot needs: K, the metric name (as a factor so it can be
# reordered for the legend), and the cross-validated mean.
final.results <- cv_metrics %>%
  mutate(.metric = as.factor(.metric)) %>%
  select(neighbors, .metric, mean)

# One line per metric across K. fct_reorder2() orders the legend entries by
# each metric's value at the largest K, so the legend order matches the
# right-hand end of the lines.
final.results %>%
  ggplot(aes(
    x = neighbors,
    y = mean,
    color = forcats::fct_reorder2(.metric, neighbors, mean)
  )) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  scale_color_wsj() +
  # Put an axis break at every K that was evaluated.
  scale_x_continuous(breaks = k_vals$neighbors) +
  theme(panel.grid.minor.x = element_blank()) +
  labs(color = "Metric", y = "Estimate", x = "K")


Group Activity 2

a. Data Preparation and Train-Test Split

Load the mlbench package and tidymodels framework, select relevant features for predicting glucose, and split the data into training and test sets. For this activity, use mass and insulin as your features.

Click for answer

Answer:

library(mlbench)
library(tidymodels)
library(dplyr)

# Load the Pima data, drop every row with a missing value in any column, and
# then keep the response (glucose) together with the two predictors.
data(PimaIndiansDiabetes2)
db <- select(drop_na(PimaIndiansDiabetes2), glucose, mass, insulin)

# Reproducible 75/25 split, stratified on the numeric response.
set.seed(2056)
db_split <- initial_split(db, prop = 0.75, strata = glucose)
db_train <- db_split %>% training()
db_test <- db_split %>% testing()

b. Model Specification

Define a linear regression model for predicting glucose as a function of mass and insulin.

c. Fit the Model

Fit the linear model to the training data, predicting glucose based on mass and insulin.

d. Predict on Test Data and Evaluate the Model

Use the fitted model to predict glucose levels on the test set and evaluate the model’s accuracy with RMSE and R-squared metrics.

(Bonus) Create a scatter plot to visualize the actual vs. predicted glucose levels, including a regression line for reference.