TidyR Demo

pipe operator

# in base R, we typically have to assign an object for each function 
# for example, if we only wanted carrier and airtime and then wanted to make cairtime in terms of hours, we would have to do this:
(new_df <- df[, c("carrier", "air_time")])

## # A tibble: 336,776 × 2
##    carrier air_time
##    <chr>      <dbl>
##  1 UA           227
##  2 UA           227
##  3 AA           160
##  4 B6           183
##  5 DL           116
##  6 UA           150
##  7 B6           158
##  8 EV            53
##  9 B6           140
## 10 AA           138
## # … with 336,766 more rows

new_df$air_time_hr <- new_df$air_time / 60
new_df

## # A tibble: 336,776 × 3
##    carrier air_time air_time_hr
##    <chr>      <dbl>       <dbl>
##  1 UA           227       3.78 
##  2 UA           227       3.78 
##  3 AA           160       2.67 
##  4 B6           183       3.05 
##  5 DL           116       1.93 
##  6 UA           150       2.5  
##  7 B6           158       2.63 
##  8 EV            53       0.883
##  9 B6           140       2.33 
## 10 AA           138       2.3  
## # … with 336,766 more rows

# with the pipe operator, we can easily combine those two operations into connected lines of code with no intermediate object needed
df %>%
  select(carrier, air_time) %>%
  mutate(air_time_hr = air_time / 60)

## # A tibble: 336,776 × 3
##    carrier air_time air_time_hr
##    <chr>      <dbl>       <dbl>
##  1 UA           227       3.78 
##  2 UA           227       3.78 
##  3 AA           160       2.67 
##  4 B6           183       3.05 
##  5 DL           116       1.93 
##  6 UA           150       2.5  
##  7 B6           158       2.63 
##  8 EV            53       0.883
##  9 B6           140       2.33 
## 10 AA           138       2.3  
## # … with 336,766 more rows

# if we just want to see the result, this is fine, but if we want to save it as a new dataframe we need to assign it to something else 
# either option below works

df %>%
  select(carrier, air_time) %>%
  mutate(air_time_hr = air_time / 60) -> new_df

new_df <- df %>%
  select(carrier, air_time) %>%
  mutate(air_time_hr = air_time / 60)

filter

# say we only want the flights who had a departure delay (dep_delay > 0)
df %>%
  filter(dep_delay > 0)

## # A tibble: 128,432 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      601            600         1      844            850
##  5  2013     1     1      608            600         8      807            735
##  6  2013     1     1      611            600        11      945            931
##  7  2013     1     1      613            610         3      925            921
##  8  2013     1     1      623            610        13      920            915
##  9  2013     1     1      632            608        24      740            728
## 10  2013     1     1      644            636         8      931            940
## # … with 128,422 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

# if it's just one line of code we can write it like this 
filter(df, dep_delay > 0)

## # A tibble: 128,432 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      601            600         1      844            850
##  5  2013     1     1      608            600         8      807            735
##  6  2013     1     1      611            600        11      945            931
##  7  2013     1     1      613            610         3      925            921
##  8  2013     1     1      623            610        13      920            915
##  9  2013     1     1      632            608        24      740            728
## 10  2013     1     1      644            636         8      931            940
## # … with 128,422 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

# but we can't do it like this 
df %>%
  filter(df, dep_delay > 0)

select

# select multiple columns
df %>%
  select(carrier, air_time)

## # A tibble: 336,776 × 2
##    carrier air_time
##    <chr>      <dbl>
##  1 UA           227
##  2 UA           227
##  3 AA           160
##  4 B6           183
##  5 DL           116
##  6 UA           150
##  7 B6           158
##  8 EV            53
##  9 B6           140
## 10 AA           138
## # … with 336,766 more rows

# by column number (not recommended)
df %>%
  select(1:3)

## # A tibble: 336,776 × 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # … with 336,766 more rows

# 'deselect' columns 
df %>%
  select(-carrier)

## # A tibble: 336,776 × 18
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 10 more variables: arr_delay <dbl>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

mutate

# make a new column 
df %>%
  mutate(miles_per_min = distance/air_time)

## # A tibble: 336,776 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   miles_per_min <dbl>

# write over existing 
df %>%
  mutate(origin = "NYC")

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

group by / summarise

# say we want to see how many flights per day 
# for multiple groups, order matters 
# when you summarise, you aggregate over the last group (takes off one group) - notice the groups box at the tops of the output 

(df %>%
  group_by(year, month, day) %>%
  summarise(daily = n()) -> daily)

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day daily
##    <int> <int> <int> <int>
##  1  2013     1     1   842
##  2  2013     1     2   943
##  3  2013     1     3   914
##  4  2013     1     4   915
##  5  2013     1     5   720
##  6  2013     1     6   832
##  7  2013     1     7   933
##  8  2013     1     8   899
##  9  2013     1     9   902
## 10  2013     1    10   932
## # … with 355 more rows

(daily %>%
  summarise(monthy = n()) -> monthly)

## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

## # A tibble: 12 × 3
## # Groups:   year [1]
##     year month monthy
##    <int> <int>  <int>
##  1  2013     1     31
##  2  2013     2     28
##  3  2013     3     31
##  4  2013     4     30
##  5  2013     5     31
##  6  2013     6     30
##  7  2013     7     31
##  8  2013     8     31
##  9  2013     9     30
## 10  2013    10     31
## 11  2013    11     30
## 12  2013    12     31

(monthly %>%
  summarise(annual = n()))

## # A tibble: 1 × 2
##    year annual
##   <int>  <int>
## 1  2013     12

pivoting

# making small test df to work with
(df %>%
  select(carrier, flight, air_time) %>%
  filter(carrier == "UA", flight %in% c(1545, 1714,1696)) %>%
   slice(1:3) -> new_df)

## # A tibble: 3 × 3
##   carrier flight air_time
##   <chr>    <int>    <dbl>
## 1 UA        1545      227
## 2 UA        1714      227
## 3 UA        1696      150

(wide_df <- pivot_wider(new_df, names_from = flight, values_from = air_time))

## # A tibble: 1 × 4
##   carrier `1545` `1714` `1696`
##   <chr>    <dbl>  <dbl>  <dbl>
## 1 UA         227    227    150

(pivot_longer(wide_df, cols = c(`1545`, `1714`, `1696`), names_to = 'flight_num', values_to = 'time_in_air'))

## # A tibble: 3 × 3
##   carrier flight_num time_in_air
##   <chr>   <chr>            <dbl>
## 1 UA      1545               227
## 2 UA      1714               227
## 3 UA      1696               150