pipe operator
# in base R, we typically have to assign an object for each function
# for example, if we only wanted carrier and airtime and then wanted to make cairtime in terms of hours, we would have to do this:
(new_df <- df[, c("carrier", "air_time")])
## # A tibble: 336,776 × 2
## carrier air_time
## <chr> <dbl>
## 1 UA 227
## 2 UA 227
## 3 AA 160
## 4 B6 183
## 5 DL 116
## 6 UA 150
## 7 B6 158
## 8 EV 53
## 9 B6 140
## 10 AA 138
## # … with 336,766 more rows
new_df$air_time_hr <- new_df$air_time / 60
new_df
## # A tibble: 336,776 × 3
## carrier air_time air_time_hr
## <chr> <dbl> <dbl>
## 1 UA 227 3.78
## 2 UA 227 3.78
## 3 AA 160 2.67
## 4 B6 183 3.05
## 5 DL 116 1.93
## 6 UA 150 2.5
## 7 B6 158 2.63
## 8 EV 53 0.883
## 9 B6 140 2.33
## 10 AA 138 2.3
## # … with 336,766 more rows
# with the pipe operator, we can easily combine those two operations into connected lines of code with no intermediate object needed
df %>%
select(carrier, air_time) %>%
mutate(air_time_hr = air_time / 60)
## # A tibble: 336,776 × 3
## carrier air_time air_time_hr
## <chr> <dbl> <dbl>
## 1 UA 227 3.78
## 2 UA 227 3.78
## 3 AA 160 2.67
## 4 B6 183 3.05
## 5 DL 116 1.93
## 6 UA 150 2.5
## 7 B6 158 2.63
## 8 EV 53 0.883
## 9 B6 140 2.33
## 10 AA 138 2.3
## # … with 336,766 more rows
# if we just want to see the result, this is fine, but if we want to save it as a new dataframe we need to assign it to something else
# either option below works
df %>%
select(carrier, air_time) %>%
mutate(air_time_hr = air_time / 60) -> new_df
new_df <- df %>%
select(carrier, air_time) %>%
mutate(air_time_hr = air_time / 60)
filter
# say we only want the flights who had a departure delay (dep_delay > 0)
df %>%
filter(dep_delay > 0)
## # A tibble: 128,432 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 601 600 1 844 850
## 5 2013 1 1 608 600 8 807 735
## 6 2013 1 1 611 600 11 945 931
## 7 2013 1 1 613 610 3 925 921
## 8 2013 1 1 623 610 13 920 915
## 9 2013 1 1 632 608 24 740 728
## 10 2013 1 1 644 636 8 931 940
## # … with 128,422 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# if it's just one line of code we can write it like this
filter(df, dep_delay > 0)
## # A tibble: 128,432 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 601 600 1 844 850
## 5 2013 1 1 608 600 8 807 735
## 6 2013 1 1 611 600 11 945 931
## 7 2013 1 1 613 610 3 925 921
## 8 2013 1 1 623 610 13 920 915
## 9 2013 1 1 632 608 24 740 728
## 10 2013 1 1 644 636 8 931 940
## # … with 128,422 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# but we can't do it like this
df %>%
filter(df, dep_delay > 0)
select
# select multiple columns
df %>%
select(carrier, air_time)
## # A tibble: 336,776 × 2
## carrier air_time
## <chr> <dbl>
## 1 UA 227
## 2 UA 227
## 3 AA 160
## 4 B6 183
## 5 DL 116
## 6 UA 150
## 7 B6 158
## 8 EV 53
## 9 B6 140
## 10 AA 138
## # … with 336,766 more rows
# by column number (not recommended)
df %>%
select(1:3)
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # … with 336,766 more rows
# 'deselect' columns
df %>%
select(-carrier)
## # A tibble: 336,776 × 18
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 10 more variables: arr_delay <dbl>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
mutate
# make a new column
df %>%
mutate(miles_per_min = distance/air_time)
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # miles_per_min <dbl>
# write over existing
df %>%
mutate(origin = "NYC")
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
group by / summarise
# say we want to see how many flights per day
# for multiple groups, order matters
# when you summarise, you aggregate over the last group (takes off one group) - notice the groups box at the tops of the output
(df %>%
group_by(year, month, day) %>%
summarise(daily = n()) -> daily)
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day daily
## <int> <int> <int> <int>
## 1 2013 1 1 842
## 2 2013 1 2 943
## 3 2013 1 3 914
## 4 2013 1 4 915
## 5 2013 1 5 720
## 6 2013 1 6 832
## 7 2013 1 7 933
## 8 2013 1 8 899
## 9 2013 1 9 902
## 10 2013 1 10 932
## # … with 355 more rows
(daily %>%
summarise(monthy = n()) -> monthly)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
## # A tibble: 12 × 3
## # Groups: year [1]
## year month monthy
## <int> <int> <int>
## 1 2013 1 31
## 2 2013 2 28
## 3 2013 3 31
## 4 2013 4 30
## 5 2013 5 31
## 6 2013 6 30
## 7 2013 7 31
## 8 2013 8 31
## 9 2013 9 30
## 10 2013 10 31
## 11 2013 11 30
## 12 2013 12 31
(monthly %>%
summarise(annual = n()))
## # A tibble: 1 × 2
## year annual
## <int> <int>
## 1 2013 12
pivoting
# making small test df to work with
(df %>%
select(carrier, flight, air_time) %>%
filter(carrier == "UA", flight %in% c(1545, 1714,1696)) %>%
slice(1:3) -> new_df)
## # A tibble: 3 × 3
## carrier flight air_time
## <chr> <int> <dbl>
## 1 UA 1545 227
## 2 UA 1714 227
## 3 UA 1696 150
(wide_df <- pivot_wider(new_df, names_from = flight, values_from = air_time))
## # A tibble: 1 × 4
## carrier `1545` `1714` `1696`
## <chr> <dbl> <dbl> <dbl>
## 1 UA 227 227 150
(pivot_longer(wide_df, cols = c(`1545`, `1714`, `1696`), names_to = 'flight_num', values_to = 'time_in_air'))
## # A tibble: 3 × 3
## carrier flight_num time_in_air
## <chr> <chr> <dbl>
## 1 UA 1545 227
## 2 UA 1714 227
## 3 UA 1696 150