# load the necessary libraries
library(tidyverse)
library(tidymodels)
library(mlbench) # for PimaIndiansDiabetes2 dataset
library(janitor)
library(parsnip)
library(kknn)
library(ggthemes)
library(purrr)
library(forcats)
Class Activity 22
Group Activity 1
Load the `mlbench` package to get the `PimaIndiansDiabetes2` dataset.
# Load the data - diabetes
data(PimaIndiansDiabetes2)
db <- PimaIndiansDiabetes2
# Drop rows with missing values and reverse the outcome's factor levels.
# NOTE(review): the reversal presumably puts "pos" first so that yardstick
# treats it as the event of interest - confirm against the level order.
db <- db %>%
  drop_na() %>%
  mutate(diabetes = fct_rev(factor(diabetes)))
# Keep only the two predictors and the outcome; this is the template data
# passed to recipe() further down.
db_raw <- db %>% select(glucose, insulin, diabetes)
- Split the data 75-25 into training and test sets using the following code.
Click for answer
Answer:
# Fix the RNG seed so the random split is reproducible
set.seed(123)
# 75% of the rows go to training, the remaining 25% to testing
db_split <- initial_split(db, prop = 0.75)
# Create training data
db_train <- db_split %>% training()
# Create testing data
db_test <- db_split %>% testing()
- Follow the steps to train a 7-NN classifier using the `tidymodels` toolkit.
Click for answer
Answer:
# define recipe and preprocess the data:
# scale, then center, every predictor (glucose and insulin in db_raw)
db_recipe <- recipe(diabetes ~ ., data = db_raw) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# specify the model: 7 nearest neighbours with the "rectangular" weight function
db_knn_spec7 <- nearest_neighbor(
  mode = "classification",
  engine = "kknn",
  weight_func = "rectangular",
  neighbors = 7
)

# define the workflow: preprocessing recipe + model specification
db_workflow <- workflow() %>%
  add_recipe(db_recipe) %>%
  add_model(db_knn_spec7)

# fit the model on the training split
db_fit <- fit(db_workflow, data = db_train)
- Classify the patients in the test data frame (`db_test`).
Click for answer
Answer:
# Predictor columns of the held-out test set
test_features <- db_test %>% select(glucose, insulin)
# Predict with the fitted workflow; type = "raw" asks for the engine's native
# prediction output.
# NOTE(review): presumably a vector of class labels, since it is bound below
# as the single column `predicted` - confirm.
db_pred <- predict(db_fit, test_features, type = "raw")

# Put the true labels and the predictions side by side
db_results <- db_test %>%
  select(glucose, insulin, diabetes) %>%
  bind_cols(predicted = db_pred)

head(db_results, 6)
glucose insulin diabetes predicted
4 89 94 neg neg
7 78 88 pos neg
15 166 175 pos pos
19 103 83 neg neg
32 158 245 pos pos
36 103 192 neg neg
Group Activity 2
Calculate the accuracy, sensitivity, specificity, and positive predictive value by hand using the following confusion matrix.
# Cross-tabulate predicted class against the true diabetes status
db_results %>%
  conf_mat(truth = diabetes, estimate = predicted)
Truth
Prediction pos neg
pos 17 8
neg 12 61
Click for answer
Answer:
# Accuracy: share of test observations that were classified correctly
db_results %>%
  accuracy(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.796
# Sensitivity: share of true positives among the actual positives
db_results %>%
  sens(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 sens binary 0.586
# Specificity: share of true negatives among the actual negatives
db_results %>%
  spec(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 spec binary 0.884
# Positive predictive value: share of true positives among predicted positives
db_results %>%
  ppv(truth = diabetes, estimate = predicted)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 ppv binary 0.68
Code to recreate the plot in the slides for the `diabetes` dataset.
Click for answer
Answer:
#' Fit a k-NN classifier and evaluate it on the test set
#'
#' Builds a k-nearest-neighbour workflow (kknn engine, "rectangular" weight
#' function), fits it on `db_train`, predicts `diabetes` for `db_test`, and
#' computes accuracy, sensitivity, specificity and positive predictive value.
#'
#' Relies on `db_recipe` being defined in the calling environment; it is not
#' passed in as an argument.
#'
#' @param k Number of neighbours to use.
#' @param db_train Training data passed to `fit()`.
#' @param db_test Test data; must contain `glucose`, `insulin` and `diabetes`.
#' @return The metric table with columns `.metric`, `.estimate` and `k`
#'   (one row per metric).
metrics_for_k <- function(k, db_train, db_test) {
  db_knn_spec <- nearest_neighbor(
    mode = "classification",
    engine = "kknn",
    weight_func = "rectangular",
    neighbors = k
  )

  db_knn_wkflow <- workflow() %>%
    add_recipe(db_recipe) %>%
    add_model(db_knn_spec)

  db_knn_fit <- fit(db_knn_wkflow, data = db_train)

  # Predict on the predictor columns of the test set only
  test_features <- db_test %>% select(glucose, insulin)
  nn1_pred <- predict(db_knn_fit, test_features, type = "raw")

  # True labels next to the predictions
  db_results <- db_test %>%
    select(diabetes) %>%
    bind_cols(predicted = nn1_pred)

  # Evaluate all four metrics in one call
  custom_metrics <- metric_set(accuracy, sens, spec, ppv)
  metrics <- custom_metrics(
    db_results,
    truth = diabetes,
    estimate = predicted
  )

  # Drop the estimator column and tag every metric row with the k that
  # produced it (the scalar k is recycled across the rows).
  metrics %>%
    select(-.estimator) %>%
    mutate(k = k)
}
# Candidate neighbourhood sizes: 1, 2, ..., 40
k <- seq(1, 40, by = 1)

# Evaluate the classifier for every k and row-bind the metric tables
optim.results <- purrr::map_dfr(k, ~ metrics_for_k(.x, db_train, db_test))

# One line per metric across k; the legend is ordered with fct_reorder2()
# on (k, .estimate).
optim.results %>%
  ggplot(aes(
    x = k,
    y = .estimate,
    color = forcats::fct_reorder2(.metric, k, .estimate)
  )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  ggthemes::scale_color_wsj() +
  scale_x_continuous(breaks = k) +
  theme(
    panel.grid.minor.x = element_blank(),
    axis.text = element_text(size = 6, angle = 20)
  ) +
  labs(color = "Metric", y = "Estimate", x = "K")