# load the necessary libraries
library(tidyverse)
library(tidymodels)
library(mlbench) # for PimaIndiansDiabetes2 dataset
library(janitor)
library(yardstick)
library(parsnip) # tidy interface to models
library(ggthemes)
library(forcats)
library(probably)
library(yardstick)
Class Activity 23
Group Activity 1
Load the `mlbench` package to get the `PimaIndiansDiabetes2` dataset.
# Load the data - diabetes
data(PimaIndiansDiabetes2)
db <- PimaIndiansDiabetes2

# Drop every row that has a missing value in any column, then keep only the
# two predictors and the outcome used in this activity.
db <- db %>% drop_na()
db_raw <- db %>% select(glucose, insulin, diabetes)

# 80/20 train/test split. No seed is set in this section, so the split (and
# every result downstream of it) changes from run to run.
db_split <- initial_split(db_raw, prop = 0.80)

# Create training data
db_train <- db_split %>% training()

# Create testing data
db_test <- db_split %>% testing()
a. Creating the Recipe: Construct a recipe on the training set that normalizes the `glucose` and `insulin` predictors used to predict `diabetes` status, so that both predictors are on comparable scales.
Click for answer
Answer:
# Preprocessing recipe: model diabetes from glucose and insulin, estimated on
# the training data. KNN is distance-based, so both predictors are scaled and
# then centered to put them on a comparable footing.
db_recipe <- recipe(diabetes ~ glucose + insulin, data = db_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())
b. Model Specification: Define the KNN model using a flexible `tune()` placeholder for the number of neighbors, specifying a classification task.
Click for answer
Answer:
# KNN classification spec on the kknn engine. `neighbors = tune()` leaves K
# unset so it can be chosen later by tune_grid(); the "rectangular" weight
# function gives every neighbor an equal vote.
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  engine = "kknn",
  mode = "classification",
  neighbors = tune()
)
c. Creating Folds: Divide the training data into 10 stratified folds based on the diabetes outcome to prepare for cross-validation, ensuring representation.
Click for answer
Answer:
# 10-fold cross-validation folds on the training data, stratified by the
# diabetes outcome so each fold has a similar class balance.
db_vfold <- vfold_cv(db_train, v = 10, strata = diabetes)
d. Cross-Validation Grid: Generate a sequence of K values to test with 10-fold cross-validation, evaluating model performance across a range of neighbors.
Click for answer
Answer:
# Candidate values of K: every integer from 1 to 40. The column name must be
# `neighbors` to match the tune() placeholder in knn_spec.
k_vals <- tibble(neighbors = seq(from = 1, to = 40, by = 1))

# Combine the recipe and the model spec in a workflow and evaluate each K on
# the cross-validation folds. save_pred = TRUE keeps the out-of-fold
# predictions alongside the metrics.
knn_fit <- workflow() %>%
  add_recipe(db_recipe) %>%
  add_model(knn_spec) %>%
  tune_grid(
    resamples = db_vfold,
    grid = k_vals,
    metrics = metric_set(yardstick::ppv, yardstick::accuracy, sens, spec),
    control = control_resamples(save_pred = TRUE)
  )

# One row per (K, metric) with the mean and standard error across folds.
cv_metrics <- collect_metrics(knn_fit)

# For each metric, show the K value(s) with the highest mean. slice_max()
# keeps ties by default, so a metric can contribute more than one row.
cv_metrics %>%
  group_by(.metric) %>%
  slice_max(mean)
# A tibble: 8 × 7
# Groups: .metric [4]
neighbors .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 25 accuracy binary 0.751 10 0.0167 Preprocessor1_Model25
2 26 accuracy binary 0.751 10 0.0167 Preprocessor1_Model26
3 7 ppv binary 0.791 10 0.0286 Preprocessor1_Model07
4 8 ppv binary 0.791 10 0.0286 Preprocessor1_Model08
5 37 sens binary 0.910 10 0.0240 Preprocessor1_Model37
6 38 sens binary 0.910 10 0.0240 Preprocessor1_Model38
7 5 spec binary 0.526 10 0.0582 Preprocessor1_Model05
8 6 spec binary 0.526 10 0.0582 Preprocessor1_Model06
e. Visualization: Plot the cross-validation results to determine the optimal K value, comparing different performance metrics visually.
Click for answer
Answer:
# Keep only what the plot needs: K, the metric name (as a factor), and the
# cross-validated mean of that metric.
final.results <- cv_metrics %>%
  mutate(.metric = as.factor(.metric)) %>%
  select(neighbors, .metric, mean)

# One line per metric across K. fct_reorder2() orders the legend entries by
# each metric's value at the largest K, so the legend order follows the
# right-hand ends of the lines.
final.results %>%
  ggplot(aes(
    x = neighbors,
    y = mean,
    color = forcats::fct_reorder2(.metric, neighbors, mean)
  )) +
  # `linewidth` replaces `size` for line geoms (size is deprecated for lines
  # from ggplot2 3.4.0 onward).
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  scale_color_wsj() +
  # One axis break per candidate K (first column of k_vals).
  scale_x_continuous(breaks = k_vals[[1]]) +
  theme(panel.grid.minor.x = element_blank()) +
  labs(color = "Metric", y = "Estimate", x = "K")
Group Activity 2
a. Data Preparation and Train-Test Split
Load the `mlbench` package and the `tidymodels` framework, select relevant features for predicting `glucose`, and split the data into training and test sets. For this activity, use `mass` and `insulin` as your features.
Click for answer
Answer:
library(mlbench)
library(tidymodels)
library(dplyr)
data(PimaIndiansDiabetes2)

# Drop rows with a missing value in any column of the full dataset, then keep
# the outcome (glucose) and the two features used in this activity.
db <- PimaIndiansDiabetes2 %>%
  drop_na() %>%
  select(glucose, mass, insulin)

# Splitting the data
# Fixed seed so the 75/25 split is reproducible; stratifying on glucose keeps
# the outcome distribution similar in the training and test sets.
set.seed(2056)
db_split <- initial_split(db, prop = 0.75, strata = glucose)
db_train <- training(db_split)
db_test <- testing(db_split)
b. Model Specification
Define a linear regression model for predicting `glucose` as a function of `mass` and `insulin`.
c. Fit the Model
Fit the linear model to the training data, predicting `glucose` based on `mass` and `insulin`.
d. Predict on Test Data and Evaluate the Model
Use the fitted model to predict `glucose` levels on the test set and evaluate the model’s accuracy with RMSE and R-squared metrics.