# Small example: grade labels stored as text, one row per student id.
mydata <- tibble(
  id = 1:4,
  grade = c("9th", "10th", "11th", "9th")
)

# factor() with no explicit `levels` sorts the unique labels as text,
# so "10th" and "11th" come before "9th".
mydata <- mutate(mydata, grade_fac = factor(grade))

# Inspect the resulting level order.
levels(mydata$grade_fac)
[1] "10th" "11th" "9th"
STAT 220
See the forcats cheatsheet and the forcats vignette.
fct_relevel()
fct_collapse()
and fct_lump()
# Six observations of a four-level compass-region code.
mydata <- tibble(loc = c("SW", "NW", "NW", "NE", "SE", "SE"))

# Add three factor versions of `loc` and print the result.
mutate(
  mydata,
  loc_fac = factor(loc),
  # collapse levels: merge the four codes into two broader groups
  loc2 = fct_collapse(
    loc_fac,
    south = c("SW", "SE"),
    north = c("NE", "NW")
  ),
  # most common 2 levels + other
  loc3 = fct_lump(loc_fac, n = 2, other_level = "other")
)
# A tibble: 6 × 4
loc loc_fac loc2 loc3
<chr> <fct> <fct> <fct>
1 SW SW south other
2 NW NW north NW
3 NW NW north NW
4 NE NE north other
5 SE SE south SE
6 SE SE south SE
fct_infreq()
fct_infreq(): Order factor levels by their frequency in the data (most common level first).
fct_rev(): Reverse the order of factor levels.
fct_anon(): Anonymize factor levels by replacing them with arbitrary numeric identifiers assigned in random order.
ca10-yourusername repository from GitHub
10:00
gss_cat
A sample of data from the General Social Survey, a long-running US survey conducted by NORC at the University of Chicago.
# A tibble: 21,483 × 9
year marital age race rincome partyid relig denom tvhours
<int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
1 2000 Never married 26 White $8000 to 9999 Ind,near … Prot… Sout… 12
2 2000 Divorced 48 White $8000 to 9999 Not str r… Prot… Bapt… NA
3 2000 Widowed 67 White Not applicable Independe… Prot… No d… 2
4 2000 Never married 39 White Not applicable Ind,near … Orth… Not … 4
5 2000 Divorced 25 White Not applicable Not str d… None Not … 1
6 2000 Married 25 White $20000 - 24999 Strong de… Prot… Sout… NA
7 2000 Never married 36 White $25000 or more Not str r… Chri… Not … 3
8 2000 Divorced 44 White $7000 to 7999 Ind,near … Prot… Luth… NA
9 2000 Married 44 White $25000 or more Not str d… Prot… Other 0
10 2000 Married 47 White $25000 or more Strong re… Prot… Sout… 3
# ℹ 21,473 more rows
Reorder the levels
Recode the levels
Collapse levels
Lump levels
How could we improve the partyid labels?
fct_recode()
# Mean TV hours per party affiliation, using clearer party labels.
gss_cat %>%
  select(partyid, tvhours) %>%
  drop_na(tvhours) %>%
  mutate(
    # fct_recode() takes "new label" = "old level" pairs; levels that are
    # not listed keep their existing labels.
    partyid = fct_recode(
      partyid,
      "Republican, strong"    = "Strong republican",
      "Republican, weak"      = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak"        = "Not str democrat",
      "Democrat, strong"      = "Strong democrat"
    )
  ) %>%
  group_by(partyid) %>%
  summarise(tvhours = mean(tvhours)) %>%
  # Order the y axis by mean tvhours rather than by the stored level order.
  ggplot(aes(x = tvhours, y = fct_reorder(partyid, tvhours))) +
  geom_point() +
  labs(y = "partyid")
fct_collapse()
# Mean TV hours per party affiliation after merging the republican-leaning
# and democrat-leaning levels into two broader groups.
gss_cat %>%
  select(partyid, tvhours) %>%
  drop_na(tvhours) %>%
  mutate(
    # Each named argument is a new level made from the listed old levels;
    # levels not mentioned are left as they are.
    partyid = fct_collapse(
      partyid,
      conservative = c(
        "Strong republican",
        "Not str republican",
        "Ind,near rep"
      ),
      liberal = c(
        "Strong democrat",
        "Not str democrat",
        "Ind,near dem"
      )
    )
  ) %>%
  group_by(partyid) %>%
  summarise(tvhours = mean(tvhours)) %>%
  # Order the y axis by mean tvhours rather than by the stored level order.
  ggplot(aes(x = tvhours, y = fct_reorder(partyid, tvhours))) +
  geom_point() +
  labs(y = "partyid")
fct_lump()
To enhance your data analysis, you can use the following factor manipulation techniques:
10:00