# load the necessary libraries
library(tidyverse)
library(tidymodels)
library(mlbench) # for PimaIndiansDiabetes2 dataset
library(janitor)
library(parsnip)
library(kknn)
library(ggthemes)
library(purrr)
library(forcats)

Class Activity 22
Group Activity 1
Load the mlbench package to get the PimaIndiansDiabetes2 dataset.
# Load the diabetes data shipped with mlbench.
data(PimaIndiansDiabetes2)

# Keep complete rows only and reverse the outcome's level order so that "pos"
# becomes the first level; the confusion matrix and the sens/spec/ppv values
# further down all treat "pos" as the event of interest.
db <- PimaIndiansDiabetes2 %>%
  drop_na() %>%
  mutate(diabetes = fct_rev(factor(diabetes)))

# Two-predictor view (plus the outcome) used as the template for the recipe.
db_raw <- db %>% select(glucose, insulin, diabetes)

- Split the data
75-25 into training and test sets using the following code.
Click for answer
Answer:
# Fix the RNG seed so the random 75/25 split is reproducible.
set.seed(123)
db_split <- initial_split(db, prop = 0.75)

# Training portion (75% of the rows) used to fit the classifier.
db_train <- training(db_split)

# Held-out portion (25% of the rows) used only for evaluation.
db_test <- testing(db_split)

- Follow the steps to train a 7-NN classifier using the
tidymodels toolkit
Click for answer
Answer:
# Recipe: model diabetes from every other column of db_raw (glucose and
# insulin) and standardise the predictors (scale, then center). db_raw acts as
# the template that fixes the column names and roles; the workflow is fitted
# on db_train below.
db_recipe <- recipe(diabetes ~ ., data = db_raw) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# Model: 7-nearest-neighbour classifier on the kknn engine with the
# "rectangular" weight function.
db_knn_spec7 <- nearest_neighbor(
  mode = "classification",
  engine = "kknn",
  weight_func = "rectangular",
  neighbors = 7
)

# Workflow: bundle the preprocessing recipe and the model specification.
db_workflow <- workflow() %>%
  add_recipe(db_recipe) %>%
  add_model(db_knn_spec7)

# Fit the whole workflow on the training set only.
db_fit <- fit(db_workflow, data = db_train)

- Classify the individuals in the
test data frame.
Click for answer
Answer:
# Keep only the two predictors the recipe was built on.
test_features <- db_test %>% select(glucose, insulin)

# type = "raw" asks for the engine's own prediction output; it is bound below
# as a single `predicted` column next to the true labels.
db_pred <- predict(db_fit, new_data = test_features, type = "raw")

# Test-set predictors and truth side by side with the predicted class.
db_results <- db_test %>%
  select(glucose, insulin, diabetes) %>%
  bind_cols(predicted = db_pred)

head(db_results, 6)
glucose insulin diabetes predicted
4 89 94 neg neg
7 78 88 pos neg
15 166 175 pos pos
19 103 83 neg neg
32 158 245 pos pos
36 103 192 neg neg
Group Activity 2
Calculate the accuracy, sensitivity, specificity, and positive predictive value by hand using the following confusion matrix.
# Cross-tabulate predicted class (rows) against true class (columns).
db_results %>%
  conf_mat(truth = diabetes, estimate = predicted)
           Truth
Prediction pos neg
pos 17 8
neg 12 61
Click for answer
Answer:
# Accuracy: correct predictions over all test rows, (17 + 61) / 98.
db_results %>%
  accuracy(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.796
# Sensitivity: true "pos" cases that were predicted "pos", 17 / (17 + 12).
db_results %>%
  sens(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 sens binary 0.586
# Specificity: true "neg" cases that were predicted "neg", 61 / (8 + 61).
db_results %>%
  spec(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 spec binary 0.884
# Positive predictive value: "pos" predictions that are truly "pos",
# 17 / (17 + 8).
db_results %>%
  ppv(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 ppv binary 0.68
Code to recreate the plot in the slides for the diabetes dataset.
Click for answer
Answer:
#' Fit a k-NN classifier and score it on a test set
#'
#' Builds a k-nearest-neighbour classification workflow (kknn engine,
#' "rectangular" weight function) around a preprocessing recipe, fits it on
#' `db_train`, predicts `db_test` from its `glucose` and `insulin` columns and
#' returns accuracy, sensitivity, specificity and positive predictive value.
#'
#' @param k Number of neighbours to use (a single number).
#' @param db_train Training data; must contain the columns the recipe needs.
#' @param db_test Test data; must contain `glucose`, `insulin` and `diabetes`.
#' @param rec Preprocessing recipe to put in the workflow. Defaults to the
#'   `db_recipe` object defined earlier in the script, so existing three-argument
#'   calls behave as before.
#' @return A tibble with one row per metric and the columns `.metric`,
#'   `.estimate` and `k`.
metrics_for_k <- function(k, db_train, db_test, rec = db_recipe) {
  knn_spec <- nearest_neighbor(
    mode = "classification",
    engine = "kknn",
    weight_func = "rectangular",
    neighbors = k
  )

  knn_fit <- workflow() %>%
    add_recipe(rec) %>%
    add_model(knn_spec) %>%
    fit(data = db_train)

  test_features <- db_test %>% select(glucose, insulin)
  knn_pred <- predict(knn_fit, test_features, type = "raw")

  scored <- db_test %>%
    select(diabetes) %>%
    bind_cols(predicted = knn_pred)

  score <- metric_set(accuracy, sens, spec, ppv)

  # The scalar k is recycled to one value per metric row, so the result stays
  # correct if the metric set above ever changes length.
  scored %>%
    score(truth = diabetes, estimate = predicted) %>%
    select(-.estimator) %>%
    mutate(k = k)
}

# Candidate neighbourhood sizes to compare.
k <- seq(1, 40, by = 1)
# Score every candidate k on the same train/test split and stack the per-k
# metric tibbles into one long data frame (one row per k and metric).
optim.results <- purrr::map_dfr(k, ~ metrics_for_k(.x, db_train, db_test))

# One line per metric across k. fct_reorder2() orders the legend by each
# metric's estimate at the largest k, so the legend follows the order of the
# lines at the right edge of the plot. Inside aes(), `k` is the column of
# optim.results; in scale_x_continuous() it is the vector of candidate values.
optim.results %>%
  ggplot(aes(
    x = k,
    y = .estimate,
    color = forcats::fct_reorder2(.metric, k, .estimate)
  )) +
  # linewidth replaces the deprecated `size` argument for lines (ggplot2 3.4.0)
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  ggthemes::scale_color_wsj() +
  scale_x_continuous(breaks = k) +
  theme(
    panel.grid.minor.x = element_blank(),
    axis.text = element_text(size = 6, angle = 20)
  ) +
  labs(color = "Metric", y = "Estimate", x = "K")