# A tibble: 6 × 3
year name n
<dbl> <chr> <int>
1 1880 Mary 7065
2 1880 Anna 2604
3 1880 Emma 2003
4 1880 Elizabeth 1939
5 1880 Minnie 1746
6 1880 Margaret 1578
STAT 220
The process of cleaning and unifying messy and complex data sets for easy access and analysis
Most of the operations on a data table can be achieved with
Operator | Definition |
---|---|
< |
less than |
<= |
less than or equal to |
> |
greater than |
>= |
greater than or equal to |
== |
exactly equal to |
!= |
not equal to |
x & y |
x AND y |
x %in% y |
test if x is in y |
select()
: take a subset of the columns (variables/features)
%>%
passes result on left into first argument of function on rightChaining
functions together lets you read Left-to-right
, top-to-bottom
# A tibble: 6 × 3
year name n
<dbl> <chr> <int>
1 1880 Mary 7065
2 1880 Anna 2604
3 1880 Emma 2003
4 1880 Elizabeth 1939
5 1880 Minnie 1746
6 1880 Margaret 1578
filter()
: take a subset of the rows (observations)
# A tibble: 10 × 4
year name sex n
<dbl> <chr> <chr> <int>
1 1880 Bella F 13
2 1881 Bella F 24
3 1882 Bella F 16
4 1883 Bella F 17
5 1884 Bella F 31
6 1885 Bella F 25
7 1886 Bella F 22
8 1887 Bella F 26
9 1888 Bella F 31
10 1889 Bella F 37
# A tibble: 11 × 3
year name n
<dbl> <chr> <int>
1 1995 Mike 332
2 1996 Mike 362
3 1997 Mike 334
4 1998 Mike 289
5 1999 Mike 284
6 2000 Mike 329
7 2001 Mike 315
8 2002 Mike 342
9 2003 Mike 321
10 2004 Mike 338
11 2005 Mike 300
Order rows from smallest to largest
babynames %>%
filter(name == "Mike", sex == "M",
year %in% c(1995:2005)) %>%
select(year, name, n) %>%
arrange(n)
# A tibble: 11 × 3
year name n
<dbl> <chr> <int>
1 1999 Mike 284
2 1998 Mike 289
3 2005 Mike 300
4 2001 Mike 315
5 2003 Mike 321
6 2000 Mike 329
7 1995 Mike 332
8 1997 Mike 334
9 2004 Mike 338
10 2002 Mike 342
11 1996 Mike 362
Changes ordering from largest to smallest
babynames %>%
filter(name == "Mike", sex == "M",
year %in% c(1995:2005)) %>%
select(year, name, n) %>%
arrange(desc(n))
# A tibble: 11 × 3
year name n
<dbl> <chr> <int>
1 1996 Mike 362
2 2002 Mike 342
3 2004 Mike 338
4 1997 Mike 334
5 1995 Mike 332
6 2000 Mike 329
7 2003 Mike 321
8 2001 Mike 315
9 2005 Mike 300
10 1998 Mike 289
11 1999 Mike 284
ca6-yourusername
repository from Github10:00
If we want to compare summary statistics, we might use summarize()
# A tibble: 24 × 5
# Groups: year [12]
year sex name n prop
<dbl> <chr> <chr> <int> <dbl>
1 2006 F Kaden 103 0.0000493
2 2006 M Kaden 4741 0.00216
3 2007 F Kaden 96 0.0000454
4 2007 M Kaden 4520 0.00204
5 2008 F Kaden 71 0.0000341
6 2008 M Kaden 4500 0.00207
7 2009 F Kaden 78 0.0000386
8 2009 M Kaden 4061 0.00192
9 2010 F Kaden 40 0.0000204
10 2010 M Kaden 3590 0.00175
# ℹ 14 more rows
babynames %>%
filter(name == "Kaden", !(year %in% c(1880:2005))) %>%
group_by(year) %>%
summarize(total = sum(n))
# A tibble: 12 × 2
year total
<dbl> <int>
1 2006 4844
2 2007 4616
3 2008 4571
4 2009 4139
5 2010 3630
6 2011 3323
7 2012 2972
8 2013 2662
9 2014 2486
10 2015 2357
11 2016 2039
12 2017 1851
mutate()
lets us create new variables based on manipulations of the old variables
# A tibble: 6 × 6
# Groups: year [1]
year sex name n prop percent
<dbl> <chr> <chr> <int> <dbl> <dbl>
1 1880 F Mary 7065 0.0724 7.24
2 1880 F Anna 2604 0.0267 2.67
3 1880 F Emma 2003 0.0205 2.05
4 1880 F Elizabeth 1939 0.0199 1.99
5 1880 F Minnie 1746 0.0179 1.79
6 1880 F Margaret 1578 0.0162 1.62
mutate()
with lag()
babynames %>%
filter(name == "Aaron", sex == "M") %>%
arrange(year) %>%
mutate(change_in_n = if_else(row_number() == 1,
n, # for the first row
n - lag(n, 1))) # for subsequent rows
# A tibble: 138 × 6
year sex name n prop change_in_n
<dbl> <chr> <chr> <int> <dbl> <int>
1 1880 M Aaron 102 0.000861 102
2 1881 M Aaron 94 0.000868 -8
3 1882 M Aaron 85 0.000697 -9
4 1883 M Aaron 105 0.000934 20
5 1884 M Aaron 97 0.000790 -8
6 1885 M Aaron 88 0.000759 -9
7 1886 M Aaron 86 0.000722 -2
8 1887 M Aaron 78 0.000714 -8
9 1888 M Aaron 90 0.000693 12
10 1889 M Aaron 85 0.000714 -5
# ℹ 128 more rows
The slice_ operators let you slice (subset) rows:
# A tibble: 5 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Kuwait Asia 1957 58.0 212846 113523.
2 Kuwait Asia 1972 67.7 841934 109348.
3 Kuwait Asia 1952 55.6 160000 108382.
4 Kuwait Asia 1962 60.5 358266 95458.
5 Kuwait Asia 1967 64.6 575003 80895.
6 Kuwait Asia 1977 69.3 1140357 59265.
pull()
pull()
is used to extract a single column from a data frame as a vector.
# find the most common name for each year (2000-2010)
babynames %>%
group_by(year) %>%
arrange(desc(n)) %>%
slice_max(order_by = n, n = 1) %>%
ungroup() %>% # to break the group structure
pull(name) %>%
unique() # selects unique names
[1] "John" "Mary" "Robert" "James" "Linda" "Michael"
[7] "David" "Jacob" "Isabella" "Sophia" "Emma"
summarize()
: summarize collapses all variable values down to one number (by group)
mutate()
: transforms all variable values but preserves the variable length (by group)
# A tibble: 1,704 × 7
# Groups: continent [5]
country continent year lifeExp pop gdpPercap AverageLifeExp
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 60.1
2 Afghanistan Asia 1957 30.3 9240934 821. 60.1
3 Afghanistan Asia 1962 32.0 10267083 853. 60.1
4 Afghanistan Asia 1967 34.0 11537966 836. 60.1
5 Afghanistan Asia 1972 36.1 13079460 740. 60.1
6 Afghanistan Asia 1977 38.4 14880372 786. 60.1
7 Afghanistan Asia 1982 39.9 12881816 978. 60.1
8 Afghanistan Asia 1987 40.8 13867957 852. 60.1
9 Afghanistan Asia 1992 41.7 16317921 649. 60.1
10 Afghanistan Asia 1997 41.8 22227415 635. 60.1
# ℹ 1,694 more rows
Any further computations would apply to both grouping variables
gapminder %>%
group_by(continent, year) %>%
summarise(AverageLifeExp = mean(lifeExp)) %>%
slice_max(AverageLifeExp, n = 1)
# A tibble: 5 × 3
# Groups: continent [5]
continent year AverageLifeExp
<fct> <int> <dbl>
1 Africa 2007 54.8
2 Americas 2007 73.6
3 Asia 2007 70.7
4 Europe 2007 77.6
5 Oceania 2007 80.7
Any further mutations called on it would not use the grouping for aggregate statistics
10:00