# load the necessary libraries
library(tidyverse)
library(tidymodels)
library(mlbench)
library(janitor)
library(paletteer)
library(corrr)
library(forcats)
library(ggthemes)
Class Activity 21
Group Activity 1
# Load the data
data(PimaIndiansDiabetes2)
db <- PimaIndiansDiabetes2 %>% drop_na()
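Optional sanity check (not part of the original activity): compare the row counts before and after drop_na() to see how many incomplete records were removed.
# how many rows were dropped by drop_na()?
nrow(PimaIndiansDiabetes2)
nrow(db)
glimpse(db)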
# correlation plot of the variables
db %>%
  select(-diabetes) %>% # only numerical variables
  correlate() %>%
  stretch() %>%
  ggplot(aes(x, y, fill = r)) +
  geom_tile() +
  geom_text(aes(label = as.character(fashion(r)))) +
  scale_fill_paletteer_c("scico::roma", limits = c(-1, 1), direction = -1)
- Create a scatter plot using ggplot2 to visualize the classification of diabetes status based on glucose and insulin levels, color-coding negative cases in blue and positive cases in red.
ggplot(db, aes(x = glucose, y = insulin, color = diabetes)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(title = "Glucose vs. Insulin by Diabetes Status",
       x = "Glucose",
       y = "Insulin",
       color = "Diabetes Status") +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"))
- Using the provided standardization function, apply it to both the glucose and insulin columns of your dataset to create new standardized columns, then plot these standardized values to analyze diabetes status.
# function that standardizes
standardize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) /
    sd(x, na.rm = na.rm)
}

db_std <- db %>%
  mutate(glucose_std = standardize(glucose),
         insulin_std = standardize(insulin))
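For the plotting part of the prompt, a minimal sketch (assuming the db_std columns created above) mirrors the earlier scatter plot on the standardized scale:
# scatter plot of the standardized values, colored by diabetes status
ggplot(db_std, aes(x = glucose_std, y = insulin_std, color = diabetes)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(title = "Standardized Glucose vs. Insulin by Diabetes Status",
       x = "Glucose (standardized)",
       y = "Insulin (standardized)",
       color = "Diabetes Status") +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"))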
- Let’s perform all the steps involved in classifying whether a patient with given glucose and insulin levels has diabetes or not.
Answer:
# 1. Prepare raw data
db_raw <- db %>% select(glucose, insulin, diabetes)

# 2. Create a recipe for data pre-processing
db_recipe <- recipe(diabetes ~ glucose + insulin, data = db_raw) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors()) %>%
  prep()

# 3. Apply the recipe to the data set
db_scaled <- bake(db_recipe, db_raw)

# 4. Create a model specification
knn_spec <- nearest_neighbor(mode = "classification",
                             engine = "kknn",
                             neighbors = 5)

# 5. Fit the model on the pre-processed data
knn_fit <- knn_spec %>%
  fit(diabetes ~ glucose + insulin, data = db_scaled)

# 6. Classify new observations
# note: these are standardized values!
new_observations <- tibble(glucose = c(1, 2), insulin = c(-1, 1))
predict(knn_fit, new_data = new_observations)
# A tibble: 2 × 1
.pred_class
<fct>
1 neg
2 neg
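The two new observations above are already on the standardized scale. If you instead start from raw glucose and insulin measurements, one option is to push them through the same prepped recipe with bake() before predicting. This is a sketch, not part of the original answer; the measurement values and the placeholder outcome column are made up for illustration:
# hypothetical raw measurements (values invented for illustration)
new_raw <- tibble(glucose = c(120, 180),
                  insulin = c(100, 300),
                  # placeholder outcome column so bake() accepts the data
                  diabetes = factor(NA, levels = levels(db_raw$diabetes)))

# re-use the centering/scaling learned from the training data, then classify
new_scaled <- bake(db_recipe, new_data = new_raw)
predict(knn_fit, new_data = new_scaled)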
- We already know the labels of the patients in the dataset. How well does the model predict their diabetes status?
Answer:
scaled_observations <- db_scaled[1:50, ]
predictions <- predict(knn_fit, new_data = scaled_observations)
bind_cols(scaled_observations, predictions) -> predict_data
What is the accuracy percentage?
Answer:
sum(predictions == db_raw %>% select(diabetes) %>% slice(1:50))/50
[1] 0.9
# alternate
accuracy_percentage <- predict_data %>%
  mutate(correct_prediction = diabetes == .pred_class) %>%
  summarize(accuracy = mean(correct_prediction, na.rm = TRUE)) %>%
  pull(accuracy) * 100
accuracy_percentage
[1] 90
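Since yardstick loads with tidymodels, an equivalent alternative (a sketch, not the original solution) is its accuracy() metric, which compares the true labels to the predicted classes directly:
# same calculation via yardstick: true class vs. predicted class
predict_data %>%
  accuracy(truth = diabetes, estimate = .pred_class)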
- Repeat part d. with a different model fitted with a different number of neighbors. See if the accuracy percentage changes in this new setting.
Answer:
knn_spec <- nearest_neighbor(mode = "classification",
                             engine = "kknn",
                             weight_func = "rectangular",
                             neighbors = 2)

knn_fit <- knn_spec %>%
  fit(diabetes ~ ., data = db_scaled)

scaled_observations <- db_scaled
predictions <- predict(knn_fit, new_data = scaled_observations)

bind_cols(scaled_observations, predictions) -> predict_data

# accuracy percentage
accuracy_percentage <- predict_data %>%
  mutate(correct_prediction = diabetes == .pred_class) %>%
  summarize(accuracy = mean(correct_prediction, na.rm = TRUE)) %>%
  pull(accuracy) * 100
accuracy_percentage
[1] 100
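Keep in mind that these are training-set accuracies: the model is scored on the same rows it was fit on, so a very small k (like 2) looks nearly perfect. A sketch for comparing a few k values on the training data follows; the specific k values are arbitrary choices, not part of the original activity:
# training-set accuracy for a few illustrative values of k
ks <- c(2, 5, 15, 45)
train_acc <- map_dbl(ks, function(k) {
  spec <- nearest_neighbor(mode = "classification",
                           engine = "kknn",
                           weight_func = "rectangular",
                           neighbors = k)
  mod <- fit(spec, diabetes ~ ., data = db_scaled)
  preds <- predict(mod, new_data = db_scaled)
  mean(preds$.pred_class == db_scaled$diabetes)
})
tibble(k = ks, train_accuracy = train_acc)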