# load the necessary libraries
library(tidyverse)
library(ggthemes)
library(factoextra)
library(janitor)
library(broom)
# Bind `select` to dplyr's implementation in the global environment
# (presumably so another attached package cannot mask it -- confirm).
select <- dplyr::select

# Make the Stata-style theme (10 pt base font) the default for all plots.
theme_set(theme_stata(base_size = 10))
#' Standardize a numeric vector to z-scores.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be dropped when computing the mean and
#'   standard deviation? Defaults to `FALSE`.
#' @return A numeric vector the same length as `x`, computed as
#'   `(x - mean(x)) / sd(x)`.
standardize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
Class Activity 26
Group Activity 1
Let’s look at the following tibble, which randomly generates x- and y-coordinates around the cluster centroids that we just saw in class. Please answer the questions based on this data.
# Fix the seed so the simulated points are reproducible.
set.seed(1234)

# 120 points: 40 drawn around each of the centroids (2, 6.33), (4, 3) and
# (7.33, 6), using rnorm()'s default standard deviation.
my_df <- tibble(
  X1 = rnorm(n = 120, mean = rep(c(2, 4, 7.33), each = 40)),
  X2 = rnorm(n = 120, mean = rep(c(6.33, 3, 6), each = 40))
)

# Scatter plot of the raw, unlabelled points.
my_df %>%
  ggplot(aes(X1, X2)) +
  geom_point()
- How many clusters can you identify in the data?
Click for answer
Answer: Answers may vary
- Fit the `kmeans` algorithm to the data, using the number of clusters you identified in part a.
Click for answer
Answer:
# Seed the RNG: kmeans() picks its starting centroids at random.
set.seed(1234)

# Fit k-means with 3 clusters; nstart = 25 runs 25 random starts and keeps
# the best solution.
res_kmeans <- kmeans(my_df, centers = 3, nstart = 25)
- Add the cluster association to the dataset and make a scatter plot color-coded by the cluster association.
Click for answer
Answer:
# Append each observation's cluster label (.cluster) to the data, then draw
# the scatter plot coloured by that label.
res_kmeans %>%
  augment(data = my_df) %>%
  ggplot(mapping = aes(x = X1, y = X2, colour = .cluster)) +
  geom_point()
- Repeat parts b–c using a larger number of clusters than you picked in part a.
Click for answer
Answer:
# Seed the RNG: kmeans() picks its starting centroids at random.
set.seed(1234)

# Refit with more clusters (5) than the 3 chosen in part a.
res_kmeans <- kmeans(my_df, centers = 5, nstart = 25)

# Attach the cluster labels and colour the scatter plot by them.
augment(res_kmeans, data = my_df) %>%
  ggplot(aes(X1, X2, color = .cluster)) +
  geom_point()
- Store the total within-cluster sum of squares for each k in the data table `multi_kmeans`.
Click for answer
Answer:
# Seed the RNG so the random starts (and hence the elbow plot) are
# reproducible, as for the other k-means fits in this activity.
set.seed(1234)

# One row per candidate k: fit k-means, then pull out the total
# within-cluster sum of squares from the model summary.
multi_kmeans <- tibble(k = 1:10) %>%
  mutate(
    model = purrr::map(k, ~ kmeans(my_df, centers = .x, nstart = 25)),
    tot.withinss = purrr::map_dbl(model, ~ glance(.x)$tot.withinss)
  )
- Make an elbow plot modifying the code below:
Click for answer
Answer:
# Elbow plot: total within-cluster sum of squares against k.
multi_kmeans %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line() +
  # One axis break per fitted k (k runs from 1 to 10 in this activity).
  scale_x_continuous(breaks = 1:10)
- After picking an optimal number of clusters, use the built-in function in the `factoextra` package to construct the final cluster plot.
Click for answer
Answer:
# Seed the RNG: kmeans() picks its starting centroids at random.
set.seed(1234)

# Final model with the chosen number of clusters (5).
kmeans.final <- kmeans(my_df, 5, nstart = 25)

# factoextra's cluster plot of the final fit, drawn with the Stata theme.
fviz_cluster(kmeans.final, data = my_df, ggtheme = theme_stata())
(Extra) Group Activity 2
Let’s look at the following tibble, which randomly generates x- and y-coordinates around the cluster centroids. Now there are more clusters, and the data points are closer to each other. Please repeat the analysis above to find the optimal number of clusters.
# Fix the seed so the simulated points are reproducible.
set.seed(1234)

# 240 points: 40 drawn around each of six centroids, using rnorm()'s default
# standard deviation. The centroids are closer together than in Activity 1.
my_df <- tibble(
  X1 = rnorm(n = 240, mean = rep(c(2, 4, 7.33, 2.5, 5, 6), each = 40)),
  X2 = rnorm(n = 240, mean = rep(c(6.33, 3, 6, 3.5, 4.5, 5.5), each = 40))
)

# Scatter plot of the raw, unlabelled points.
my_df %>%
  ggplot(aes(X1, X2)) +
  geom_point()
Click for answer
Answer:
- How many clusters can you identify in the data?
Answer: Answers may vary!
- Fit the `kmeans` algorithm to the data, using the number of clusters you identified in part a.
# Seed the RNG: kmeans() picks its starting centroids at random.
set.seed(1234)

# Fit k-means with 6 clusters; nstart = 25 runs 25 random starts and keeps
# the best solution.
res_kmeans <- kmeans(my_df, centers = 6, nstart = 25)
- Add the cluster association to the dataset and make a scatter plot color-coded by the cluster association.
# Append each observation's cluster label (.cluster) to the data, then draw
# the scatter plot coloured by that label.
res_kmeans %>%
  augment(data = my_df) %>%
  ggplot(mapping = aes(x = X1, y = X2, colour = .cluster)) +
  geom_point()
- Repeat parts b–c using a larger number of clusters than you picked in part a.
# Seed the RNG: kmeans() picks its starting centroids at random.
set.seed(1234)

# Refit with more clusters (8) than the 6 chosen in part a.
res_kmeans <- kmeans(my_df, centers = 8, nstart = 25)

# Attach the cluster labels and colour the scatter plot by them.
augment(res_kmeans, data = my_df) %>%
  ggplot(aes(X1, X2, color = .cluster)) +
  geom_point()
- Store the total within-cluster sum of squares for each k in the data table `multi_kmeans`.
# Seed the RNG so the random starts (and hence the elbow plot) are
# reproducible, as for the other k-means fits in this activity.
set.seed(1234)

# One row per candidate k: fit k-means, then pull out the total
# within-cluster sum of squares from the model summary.
multi_kmeans <- tibble(k = 1:15) %>%
  mutate(
    model = purrr::map(k, ~ kmeans(my_df, centers = .x, nstart = 25)),
    tot.withinss = purrr::map_dbl(model, ~ glance(.x)$tot.withinss)
  )
- Make an elbow plot modifying the code below:
# Elbow plot: total within-cluster sum of squares against k.
multi_kmeans %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line() +
  # One axis break per fitted k (k runs from 1 to 15 in this activity).
  scale_x_continuous(breaks = 1:15)
- After picking an optimal number of clusters, use the built-in function in the `factoextra` package to construct the final cluster plot.
# Seed the RNG: kmeans() picks its starting centroids at random.
set.seed(1234)

# Final model with the chosen number of clusters (8).
kmeans.final <- kmeans(my_df, 8, nstart = 25)

# factoextra's cluster plot of the final fit, drawn with the Stata theme.
fviz_cluster(kmeans.final, data = my_df, ggtheme = theme_stata())