# load the necessary libraries
library(tidyverse) 
library(tidymodels)
library(mlbench)    # for PimaIndiansDiabetes2 dataset
library(janitor)
library(yardstick) 
library(parsnip) # tidy interface to models
library(ggthemes)
library(forcats)
library(probably)
library(yardstick)Class Activity 23
Group Activity 1
Load the mlbench package to get PimaIndiansDiabetes2 dataset.
# Load the data - diabetes
data(PimaIndiansDiabetes2)
db <- PimaIndiansDiabetes2
db <- db %>% drop_na() 
db_raw <- db %>% select(glucose, insulin, diabetes)
db_split <- initial_split(db_raw, prop = 0.80)
# Create training data
db_train <- db_split %>% training()
# Create testing data
db_test <- db_split %>% testing()a. Creating the Recipe: Construct a recipe for the model by normalizing glucose and insulin predictors to predict diabetes status on the training set, ensuring data scales are comparable.
Click for answer
Answer:
db_recipe <- recipe(diabetes ~  glucose + insulin, data = db_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())b. Model Specification: Define the KNN model using a flexible tune() placeholder for the number of neighbors, specifying a classification task.
Click for answer
Answer:
knn_spec <- nearest_neighbor(weight_func = "rectangular", 
                             engine = "kknn",
                             mode = "classification",
                             neighbors = tune())c. Creating Folds: Divide the training data into 10 stratified folds based on the diabetes outcome to prepare for cross-validation, ensuring representation.
Click for answer
Answer:
db_vfold <- vfold_cv(db_train, v = 10, strata = diabetes)d. Cross-Validation Grid: Generate a sequence of K values to test with 10-fold cross-validation, evaluating model performance across a range of neighbors.
Click for answer
Answer:
k_vals <- tibble(neighbors = seq(from = 1, to = 40, by = 1))knn_fit <- workflow() %>%
  add_recipe(db_recipe) %>%
  add_model(knn_spec) %>%
  tune_grid(
    resamples = db_vfold, 
    grid = k_vals,
    metrics = metric_set(yardstick::ppv, yardstick::accuracy, sens, spec),
    control = control_resamples(save_pred = TRUE))cv_metrics <- collect_metrics(knn_fit) 
cv_metrics %>% group_by(.metric) %>% slice_max(mean) # A tibble: 8 × 7
# Groups:   .metric [4]
  neighbors .metric  .estimator  mean     n std_err .config              
      <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
1        25 accuracy binary     0.751    10  0.0167 Preprocessor1_Model25
2        26 accuracy binary     0.751    10  0.0167 Preprocessor1_Model26
3         7 ppv      binary     0.791    10  0.0286 Preprocessor1_Model07
4         8 ppv      binary     0.791    10  0.0286 Preprocessor1_Model08
5        37 sens     binary     0.910    10  0.0240 Preprocessor1_Model37
6        38 sens     binary     0.910    10  0.0240 Preprocessor1_Model38
7         5 spec     binary     0.526    10  0.0582 Preprocessor1_Model05
8         6 spec     binary     0.526    10  0.0582 Preprocessor1_Model06
e. Visualization: Plot the cross-validation results to determine the optimal K value, comparing different performance metrics visually.
Click for answer
Answer:
final.results <- cv_metrics %>%  mutate(.metric = as.factor(.metric)) %>%
  select(neighbors, .metric, mean)
final.results %>%
  ggplot(aes(x = neighbors, y = mean, color = forcats::fct_reorder2(.metric, neighbors, mean))) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  scale_color_wsj() + 
  scale_x_continuous(breaks = k_vals[[1]]) +
  theme(panel.grid.minor.x = element_blank())+
  labs(color='Metric', y = "Estimate", x = "K")
Group Activity 2
a. Data Preparation and Train-Test Split
Load the mlbench package and tidymodels framework, select relevant features for predicting glucose, and split the data into training and test sets. For this activity, use mass and insulin as your features.
Click for answer
Answer:
library(mlbench)
library(tidymodels)
library(dplyr)
data(PimaIndiansDiabetes2)
db <- PimaIndiansDiabetes2 %>% 
  drop_na() %>%
  select(glucose, mass, insulin)
# Splitting the data
set.seed(2056)
db_split <- initial_split(db, prop = 0.75, strata = glucose)
db_train <- training(db_split)
db_test <- testing(db_split)b. Model Specification
Define a linear regression model for predicting glucose as a function of mass and insulin.
c. Fit the Model
Fit the linear model to the training data, predicting glucose based on mass and insulin.
d. Predict on Test Data and Evaluate the Model
Use the fitted model to predict glucose levels on the test set and evaluate the model’s accuracy with RMSE and R-squared metrics.