# load the necessary libraries
Class Activity 21
Group Activity 1
# Load the data
# Keep only the rows of PimaIndiansDiabetes2 that have no missing values.
db <- PimaIndiansDiabetes2 %>% drop_na()
# correlation plot of the variables
# Pairwise correlations are reshaped to long form (columns x, y, r) and drawn
# as a heat map with the formatted correlation printed in every tile.
db %>%
  select(-diabetes) %>% # only numerical variables
  correlate() %>%
  stretch() %>%
  ggplot(aes(x, y, fill = r)) +
  geom_tile() +
  geom_text(aes(label = as.character(fashion(r)))) +
  scale_fill_paletteer_c("scico::roma", limits = c(-1, 1), direction = -1)
- Create a scatter plot using ggplot2 to visualize the classification of diabetes status based on glucose and insulin levels, color-coding negative cases in blue and positive cases in red.
# Scatter plot of insulin against glucose, one point per row of `db`,
# coloured by diabetes status ("neg" in blue, "pos" in red).
ggplot(data = db, mapping = aes(x = glucose, y = insulin, color = diabetes)) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red")) +
  labs(
    title = "Glucose vs. Insulin by Diabetes Status",
    x = "Glucose",
    y = "Insulin",
    color = "Diabetes Status"
  ) +
  theme_minimal()
- Using the provided standardization function, apply it to both the glucose and insulin columns of your dataset to create new standardized columns, then plot these standardized values to analyze diabetes status.
# function that standardizes
# Standardize a numeric vector: subtract its mean and divide by its standard
# deviation.
#
# x      Numeric vector to standardize.
# na.rm  Passed to mean() and sd(); with the default FALSE, any NA in `x`
#        makes every element of the result NA.
#
# Returns a numeric vector the same length as `x`.
standardize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) /
    sd(x, na.rm = na.rm)
}
# Add standardized copies of glucose and insulin next to the raw columns.
db_std <- db %>%
  mutate(
    glucose_std = standardize(glucose),
    insulin_std = standardize(insulin)
  )
- Let’s perform all the steps involved in classifying whether a patient with certain glucose and insulin would have diabetes or not.
Click for answer
# 1 Prepare raw data
# Keep only the two predictors and the outcome used by the classifier.
db_raw <- db %>%
  select(glucose, insulin, diabetes)
# 2 Create a recipe for data pre-processing
# Scale and then center both predictors. The recipe is estimated with prep()
# so that it can be applied with bake() in the next step.
db_recipe <- recipe(diabetes ~ glucose + insulin, data = db_raw) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors()) %>%
  prep()
# 3 Apply the recipe to the data set
# Pre-processed version of db_raw (predictors scaled and centered).
db_scaled <- bake(db_recipe, new_data = db_raw)
# 4 Create a model specification
# k-nearest-neighbours classifier specification: kknn engine, 5 neighbours.
knn_spec <- nearest_neighbor(
  mode = "classification",
  engine = "kknn",
  neighbors = 5
)
# 5 Fit the model on the pre-processed data
# Fit the classifier on the pre-processed data using both predictors.
# The formula previously read `diabetes ~ insulin + diabetes`, which put the
# response on the right-hand side and left glucose out of the model.
# NOTE(review): recorded outputs further down may predate this fix; re-run to
# confirm them.
knn_fit <- knn_spec %>%
  fit(diabetes ~ glucose + insulin, data = db_scaled)
# 6 Classify
# These are standardized values!
# Two hypothetical patients, given on the same standardized scale as db_scaled.
new_observations <- tibble(glucose = c(1, 2), insulin = c(-1, 1))
predict(knn_fit, new_data = new_observations)
# A tibble: 2 × 1
  .pred_class
  <fct>
1 neg
2 neg
- We already know the labels of the patients in the dataset. How well does the model predict their diabetes status?
Click for answer
# Predict the first 50 pre-processed rows and place each predicted class
# (.pred_class) next to the row it was predicted from.
scaled_observations <- db_scaled[1:50, ]
predictions <- predict(knn_fit, new_data = scaled_observations)
predict_data <- bind_cols(scaled_observations, predictions)
What is the accuracy percentage?
# Fraction of the first 50 rows whose predicted class matches the recorded
# diabetes label.
sum(predictions[[".pred_class"]] == db_raw[["diabetes"]][1:50]) / 50
[1] 0.9
# alternate
# Percentage of rows in predict_data where the predicted class equals the
# recorded diabetes label.
accuracy_percentage <- predict_data %>%
  mutate(correct_prediction = diabetes == .pred_class) %>%
  summarize(accuracy = mean(correct_prediction, na.rm = TRUE)) %>%
  pull(accuracy) * 100
[1] 90
- Repeat part d. with a different model fitted with a different number of neighbors. See if the accuracy percentage changes in this new setting.
Click for answer
# Second model: kknn engine with the "rectangular" weight function and
# 2 neighbours, fitted on every predictor column in db_scaled.
knn_spec <- nearest_neighbor(
  mode = "classification",
  engine = "kknn",
  weight_func = "rectangular",
  neighbors = 2
)
knn_fit <- knn_spec %>%
  fit(diabetes ~ ., data = db_scaled)
# Predict every pre-processed row this time (not only the first 50) and place
# each predicted class (.pred_class) next to its row.
scaled_observations <- db_scaled
predictions <- predict(knn_fit, new_data = scaled_observations)
predict_data <- bind_cols(scaled_observations, predictions)
# accuracy percentage
# Percentage of rows in predict_data where the predicted class equals the
# recorded diabetes label.
accuracy_percentage <- predict_data %>%
  mutate(correct_prediction = diabetes == .pred_class) %>%
  summarize(accuracy = mean(correct_prediction, na.rm = TRUE)) %>%
  pull(accuracy) * 100
[1] 100