STAT 220
\[P(Y = 1 ) = \frac{e^{\beta_0 + \beta_1X_1}}{1 + e^{\beta_0 + \beta_1X_1}}\]
\[\text{Odds} = \frac{P(Y = 1 )}{1 - P(Y = 1 )} = e^{\beta_0 + \beta_1X_1}\]
\[\text{Log Odds} = \beta_0 + \beta_1X_1\]
# Create data split for train and test
set.seed(12345) # Ensures reproducibility of the data split
db_single <- db %>%
select(diabetes, glucose) %>%
# Relevels 'diabetes' factor to ensure 'neg' comes before 'pos'
mutate(diabetes = fct_relevel(diabetes, c("neg", "pos")))
db_split <- initial_split(db_single, prop = 0.80)
db_train <- db_split %>% training()
db_test <- db_split %>% testing()
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -5.61 0.678 -8.28 1.20e-16
2 glucose 0.0392 0.00514 7.62 2.55e-14
\[\log \left(\frac{p}{1-p}\right)=-5.61+0.0392 \cdot \text { glucose }\]
Where \(p = P(Y = 1)\) is the probability of having diabetes
A coefficient of 0.0392 for glucose means…
…a one-unit increase in glucose level increases the log odds of diabetes by 0.0392.
\[Odds = \frac{probability}{1 - probability}\]
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.00364 0.678 -8.28 1.20e-16
2 glucose 1.04 0.00514 7.62 2.55e-14
\[\text { Odds }=0.00364 \times(1.04)^{\text {glucose }}\]
An odds ratio of 1.04 means that for each additional unit of glucose, the odds of having diabetes increase by 4%.
db_recipe <- recipe(diabetes ~ ., data = db_train) %>%
step_scale(all_numeric_predictors()) %>%
step_center(all_numeric_predictors()) %>%
prep()
ca24-yourusername
repository from Github10:00
threshold = 0.30
# Prediction Probabilities
library(probably)
pred_prob <- predict(fitted_logistic_model,
new_data = db_test,
type = "prob")
db_results <- db_test %>% bind_cols(pred_prob) %>%
mutate(.pred_class = make_two_class_pred(.pred_neg,
levels(diabetes),
threshold = .30)) %>%
select(diabetes, glucose, contains(".pred"))
diabetes glucose .pred_neg .pred_pos .pred_class
25 pos 143 0.5040090 0.49599103 neg
52 neg 101 0.8402912 0.15970881 neg
60 neg 105 0.8181391 0.18186086 neg
64 neg 141 0.5235673 0.47643271 neg
69 neg 95 0.8693592 0.13064080 neg
83 neg 83 0.9141287 0.08587128 neg
98 neg 71 0.9445349 0.05546507 neg
110 pos 95 0.8693592 0.13064080 neg
135 neg 96 0.8648480 0.13515203 neg
143 neg 108 0.8000067 0.19999330 neg
custom_metrics <- metric_set(accuracy,
sensitivity,
specificity,
ppv)
custom_metrics(db_results,
truth = diabetes,
estimate = .pred_class)
# A tibble: 4 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.823
2 sensitivity binary 0.982
3 specificity binary 0.409
4 ppv binary 0.812
threshold = 0.70
# Prediction Probabilities
library(probably)
pred_prob <- predict(fitted_logistic_model,
new_data = db_test,
type = "prob")
db_results <- db_test %>% bind_cols(pred_prob) %>%
mutate(.pred_class = make_two_class_pred(.pred_neg,
levels(diabetes),
threshold = .70)) %>%
select(diabetes, glucose, contains(".pred"))
diabetes glucose .pred_neg .pred_pos .pred_class
25 pos 143 0.5040090 0.49599103 pos
52 neg 101 0.8402912 0.15970881 neg
60 neg 105 0.8181391 0.18186086 neg
64 neg 141 0.5235673 0.47643271 pos
69 neg 95 0.8693592 0.13064080 neg
83 neg 83 0.9141287 0.08587128 neg
98 neg 71 0.9445349 0.05546507 neg
110 pos 95 0.8693592 0.13064080 neg
135 neg 96 0.8648480 0.13515203 neg
143 neg 108 0.8000067 0.19999330 neg
custom_metrics <- metric_set(accuracy,
sensitivity,
specificity,
ppv)
custom_metrics(db_results,
truth = diabetes,
estimate = .pred_class)
# A tibble: 4 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.785
2 sensitivity binary 0.754
3 specificity binary 0.864
4 ppv binary 0.935
library(yardstick)
diabetes_prob <- predict(fitted_logistic_model, db_test, type = "prob")
diabetes_results <- db_test %>% select(diabetes) %>% bind_cols(diabetes_prob)
diabetes_results %>%
roc_curve(truth = diabetes, .pred_neg) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_line(color = "#1f77b4", size = 1.2) +
geom_abline(linetype = "dashed", color = "gray") +
annotate("text", x = 0.8, y = 0.1, label = paste("AUC =", round(roc_auc(diabetes_results, truth = diabetes, .pred_neg)$.estimate, 3)), hjust = 1, color = "#ff7f0e", size = 5, fontface = "bold") +
labs(title = "ROC Curve", subtitle = "Performance of the Logistic Regression Model", x = "False Positive Rate (1 - specificity)", y = "True Positive Rate (sensitivity)") +
theme_minimal()
# A tibble: 58 × 3
.threshold specificity sensitivity
<dbl> <dbl> <dbl>
1 -Inf 0 1
2 0.106 0 1
3 0.113 0.0455 1
4 0.154 0.0909 1
5 0.159 0.182 1
6 0.187 0.227 1
7 0.193 0.273 1
8 0.199 0.318 1
9 0.218 0.364 1
10 0.239 0.409 1
# ℹ 48 more rows
# Compute the ROC curve
roc_curve(diabetes_results, truth = diabetes, .pred_neg) %>%
# Look for the point where specificity is 0.87 and sensitivity is 0.76
arrange(abs(specificity - (1 - 0.13)), abs(sensitivity - 0.76))
# A tibble: 58 × 3
.threshold specificity sensitivity
<dbl> <dbl> <dbl>
1 0.706 0.864 0.754
2 0.698 0.864 0.772
3 0.690 0.864 0.807
4 0.681 0.864 0.825
5 0.714 0.909 0.754
6 0.738 0.909 0.737
7 0.745 0.909 0.719
8 0.760 0.909 0.702
9 0.767 0.909 0.684
10 0.774 0.909 0.667
# ℹ 48 more rows
# Compute the ROC curve
roc_curve(diabetes_results, truth = diabetes, .pred_neg) %>%
# Find point where specificity is 0.87 and sensitivity is 0.76
arrange(abs(specificity - (1 - 0.13)), abs(sensitivity - 0.76)) %>%
slice(1)
# A tibble: 1 × 3
.threshold specificity sensitivity
<dbl> <dbl> <dbl>
1 0.706 0.864 0.754
10:00