Dplyr

We are gonna go through filter(), arrange(), select(), mutate(), summarise(), group_by(), slice(), rename(), transmute(), sample_n(), sample_frac()

dplyr:

  • first argument is a data frame
  • function
  • result is a new data frame

    Pipe

    %>% is pronounced THEN

All-in-one Example

manager <- c(1, 2, 3, 4, 5) 
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09") 
country <- c("US", "US", "UK", "UK", "UK") 
gender <- c("M", "F", "F", "M", "F") 
age <- c(32, 45, 25, 39, 99) 
q1 <- c(5, 3, 3, 0, 2) 
q2 <- c(4, 5, 0, 3, 2) 
q3 <- c(5, 2, 5, 4, 1) 
q4 <- c(5, 0, 5, NA, 2) 
q5 <- c(5, 5, 2, NA, 1) 
leadership <- data.frame(manager, date, country, gender, age, 
                         q1, q2, q3, q4, q5, stringsAsFactors=FALSE)
> leadership
leadership <- leadership %>%
	mutate()

  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
5       5   5/1/09      UK      F  99  2  2  1  2  1
library(dplyr) #A 
# add total_score and mean_score columns
leadership <- mutate(leadership, #B 
                     total_score = q1 + q2 + q3 + q4 + q5, #B 
                     mean_score = total_score / 5) #B 
> leadership
  manager     date country gender age q1 q2 q3 q4 q5 total_score mean_score
1       1 10/24/08      US      M  32  5  4  5  5  5          24        4.8
2       2 10/28/08      US      F  45  3  5  2  5  5          20        4.0
3       3  10/1/08      UK      F  25  3  5  5  5  2          20        4.0
4       4 10/12/08      UK      M  39  3  3  4 NA NA          NA         NA
5       5   5/1/09      UK      F  99  2  2  1  2  1           8        1.6
# recode or change item in variable gender to M::male, F::female
# this is similar to search and replace
leadership$gender <- recode(leadership$gender, #C 
                            "M" = "male", "F" = "female") #C 
> leadership
  manager     date country gender age q1 q2 q3 q4 q5 total_score mean_score
1       1 10/24/08      US   male  32  5  4  5  5  5          24        4.8
2       2 10/28/08      US female  45  3  5  2  5  5          20        4.0
3       3  10/1/08      UK female  25  3  5  5  5  2          20        4.0
4       4 10/12/08      UK   male  39  3  3  4 NA NA          NA         NA
5       5   5/1/09      UK female  99  2  2  1  2  1           8        1.6														
# recode is similar to renaming the row, and rename is for columns
# right to left
leadership <- rename(leadership, ID = "manager", sex = "gender")#D
> leadership
  ID     date country    sex age q1 q2 q3 q4 q5 total_score mean_score
1  1 10/24/08      US   male  32  5  4  5  5  5          24        4.8
2  2 10/28/08      US female  45  3  5  2  5  5          20        4.0
3  3  10/1/08      UK female  25  3  5  5  5  2          20        4.0
4  4 10/12/08      UK   male  39  3  3  4 NA NA          NA         NA
5  5   5/1/09      UK female  99  2  2  1  2  1           8        1.6

leadership <- arrange(leadership, sex, total_score) #E 
> leadership
  ID     date country    sex age q1 q2 q3 q4 q5 total_score mean_score
1  5   5/1/09      UK female  99  2  2  1  2  1           8        1.6
2  2 10/28/08      US female  45  3  5  2  5  5          20        4.0
3  3  10/1/08      UK female  25  3  5  5  5  2          20        4.0
4  1 10/24/08      US   male  32  5  4  5  5  5          24        4.8
5  4 10/12/08      UK   male  39  3  3  4 NA NA          NA         NA
> leadership <- arrange(leadership, sex, desc(total_score)) #E
> leadership
  ID     date country    sex age q1 q2 q3 q4 q5 total_score mean_score
1  2 10/28/08      US female  45  3  5  2  5  5          20        4.0
2  3  10/1/08      UK female  25  3  5  5  5  2          20        4.0
3  5   5/1/09      UK female  99  2  2  1  2  1           8        1.6
4  1 10/24/08      US   male  32  5  4  5  5  5          24        4.8
5  4 10/12/08      UK   male  39  3  3  4 NA NA          NA         NA

> leadership_ratings <- select(leadership, ID, mean_score) #F
> leadership_ratings
  ID mean_score
1  2        4.0
2  3        4.0
3  5        1.6
4  1        4.8
5  4         NA
leadership_men_high <- filter(leadership, #G
                              sex == "male" & total_score > 10) #G 
> leadership_men_high <- filter(leadership, #G
+                               sex == "male" & total_score > 10) #G
> leadership_men_high
  ID     date country  sex age q1 q2 q3 q4 q5 total_score mean_score
1  1 10/24/08      US male  32  5  4  5  5  5          24        4.8															

filter()

library('nycflights13')
library('dplyr')
filter(flights, month == 1, day == 1, dep_delay >= 15)

flights[flights$month == 1 & flights$day == 1 & flights$dep_delay >= 15, ]

# this is similar 
subset(flights, month == 1 & day == 1 & dep_delay >= 15)
# better 

filter(), with %in%, select specific values in a col

filter(flights, month %in% c(6,7,8))

slice()

slice() will cut by user input

slice(flights, c(1:3, 5200, 9000:9100))

arrange(): order data

arrange(flights, year, month, day)

arrange in descending order

arrange(flights, desc(dep_delay))

sorting missing value

df <- tibble(x = c(5, 2, NA))
arrange(df, x)

select()

# select columns by name
select(flights, year, month, day)

head(dplyr::select(flights, 'year', 'month', 'day'),5)

# Select all columns between year and day (inclusive)
head(dplyr::select(flights, year:day),2)

# the reason, dplyr::select is used because it may clash with other packages

# Select all columns except those from year to day (inclusive)
head(dplyr::select(flights, -(year:day)),3)

If the column names are stored quoted in a variable, they should be passed to the one_of argument.


theCols <- c('flights', 'year', 'month','day')
select(flights, one_of(theCols))

We can also use everything() helper with select to move the selected columns to
the beginning of a dataframe Ex: Flights has 19 columns from year, month, day … time_hour. If we want to move
time_hour to the first column we can use select and everything()

select(flights, time_hour, day, month, year, everything())
# A tibble: 336,776 x 19
   time_hour             day month  year dep_time sched_dep_time dep_delay
   <dttm>              <int> <int> <int>    <int>          <int>     <dbl>
 1 2013-01-01 05:00:00     1     1  2013      517            515         2
 2 2013-01-01 05:00:00     1     1  2013      533            529         4
 3 2013-01-01 05:00:00     1     1  2013      542            540         2
 4 2013-01-01 05:00:00     1     1  2013      544            545        -1

Rename a variable (column)

select(df, something, name_to_be_change = current_name)
Lets change tailnum to tail_num

select(flights, flight, tail_num = tailnum)
   flight tail_num
    <int> <chr>
 1   1545 N14228
 2   1714 N24211
 3   1141 N619AA
 4    725 N804JB
 5    461 N668DN

Or if you want to keep every variable, you can use rename() function rename(df, name_to_be_change = current_name)

rename(flights, tail_num = tailnum)
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850

Select with other helper functions

  • starts_with(“abc”): matches names that begin with “abc”.
  • ends_with(“xyz”): matches names that end with “xyz”.
  • contains(“ijk”): matches names that contain “ijk”.
  • matches(“(.)\1”): selects variables that match a regular expression. This one matches any variables that contain repeated characters.
  • num_range(“x”, 1:3) matches x1, x2 and x3.
select(flights, contains(arr))
   arr_time sched_arr_time arr_delay carrier
      <int>          <int>     <dbl> <chr>
 1      830            819        11 UA
 2      850            830        20 UA
 3      923            850        33 AA

Group_by()

  • .by_group = TRUE for arrange()
  • sample_n(), sample_frac()

In the example below, we group dest to 105 group
Then, we use

> by_dest <- group_by(flights, dest)
> by_dest
# A tibble: 336,776 x 19
# Groups:   dest [105]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 
> delay <- summarise(by_dest,
+ 	count = n(),
+ 	dist = mean(distance, na.rm = TRUE),
+ 	delay = mean(arr_delay, na.rm = TRUE))
> delay
# A tibble: 105 x 4
   dest  count  dist delay
   <chr> <int> <dbl> <dbl>
 1 ABQ     254 1826   4.38
 2 ACK     265  199   4.85
 3 ALB     439  143  14.4
 4 ANC       8 3370  -2.5
 5 ATL   17215  757. 11.3
 6 AUS    2439 1514.  6.02
 7 AVL     275  584.  8.00
 
# Subset the data to only include frequently flown planes
# and distances < 3000
delay <- filter(delay, count > 20, dist < 3000)
delay
   dest  count  dist delay
   <chr> <int> <dbl> <dbl>
 1 ABQ     254 1826   4.38
 2 ACK     265  199   4.85
 3 ALB     439  143  14.4
 4 ATL   17215  757. 11.3
 5 AUS    2439 1514.  6.02

Other useful functions for R

  • n(): number of items in a group (observations)
  • n_distinct(x): find distinct value in x
  • first(x) = x[1]
  • last(x) = x[n] = x[length(x)]
> destinations <- group_by(flights, dest)
> destinations
# Groups:   dest [105]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 
> summarise(destinations,
+ 	planes = n_distinct(tailnum),
+ 	flights = n() # this will count how many thing in a group
+ )
   dest  planes flights
   <chr>  <int>   <int>
 1 ABQ      108     254
 2 ACK       58     265
 3 ALB      172     439
 4 ANC        6       8
 5 ATL     1180   17215
 6 AUS      993    2439
 7 AVL      159     275
 8 BDL      186     443
 9 BGR       46     375
10 BHM       45     297

# we can verify by using 
count(filter(destinations, dest == "AUS"))
# Groups:   dest [1]
  dest      n
  <chr> <int>
1 AUS    2439

Merging (base R)

# merge option
flights.df <- merge(flights.df, cor, by.x=c("origin"), by.y=c("faa"))
flights.df <- merge(flights.df, cor, by.x=c("dest"), by.y=c("faa"),suffixes=c(".origin",".dest"))

remove col/s in R

N0EGMQ         
#find value in a col
flights %>% filter_all(any_vars(. %in% c('N0EGMQ', 'N10156')))




# sum them up by year


# this is NA: N267AT
planes %>% filter_all(any_vars(. %in% c('N267AT')))

# find values
flights %>% filter_all(any_vars(. %in% c('N0EGMQ', 'N10156')))
# because we just want to know the relation between 

#

planes_delay %>% data.frame(row.names = planes_delay$tailnum)

cor(select(planes_delay,!(arr_delay)), select(planes_delay,!(year)))

cor(arrange(select(planes,c(tailnum,year)),tailnum), arrange(select(flights,c(tailnum,arr_delay)),tailnum))

planes.delay <- planes.delay %>%   filter(across(everything(),~ !is.na(.)))

Pull all of the departure-related columns

select(flights, contains(“dep”))

Pull all of the arrival and departure related columns

select(flights, contains(“dep”), contains(“arr”))

# Correlation matrix from mtcars
# with mpg, cyl, and disp as rows
# and hp, drat, and wt as columns
x <- mtcars[1:3]
y <- mtcars[4:6]
cor(x, y)
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE))

head(delay,3)

inner_join

Joining three tables with the same variable

sets
   set_num   name                                                  year theme_id
   <chr>     <chr>                                                <dbl>    <dbl>
 1 700.3-1   Medium Gift Set (ABB)                                 1949      365
 2 700.1.1-1 Single 2 x 4 Brick (ABB)                              1950      371
 3 700.B.2-1 Single 1 x 2 x 3 Window without Glass (ABB)           1950      371
 
inventories
      id version set_num 
   <dbl>   <dbl> <chr>   
 1     1       1 7922-1  

inventory_parts
   inventory_id part_num             color_id quantity
          <dbl> <chr>                   <dbl>    <dbl>
 1           21 3009                        7       50
 2           25 21019c00pat004pr1033       15        1
 3           25 24629pr0002                78        1
 4           25 24634pr0001                 5        1
 
sets %>%
	inner_join(inventories, by = "set_num") %>%
	inner_join(inventory_parts, by = c("id" = "inventory_id"))

sets
   set_num name           year theme_id    id version part_num color_id quantity
   <chr>   <chr>         <dbl>    <dbl> <dbl>   <dbl> <chr>       <dbl>    <dbl>
 1 700.3-1 Medium Gift ~  1949      365 24197       1 bdoor01         2        2
 2 700.3-1 Medium Gift ~  1949      365 24197       1 bdoor01        15        1
 3 700.3-1 Medium Gift ~  1949      365 24197       1 bdoor01         4        1
 4 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02        15        6

parts has 3 columns part_num, name, and part_cat_id part_categories has id, name when we do parts %>% inner_join(part_categories, by = c("part_cat_id" = "id"))

All parts columns will be the input of inner_join, when we state part_cat_id = id inner_join will take the name on the left to be the name of final column

parts
# A tibble: 17,501 x 3
   part_num   name                                                      part_cat_id
   <chr>      <chr>                                                           <dbl>
 1 0901       Baseplate 16 x 30 with Set 080 Yellow House Print                   1
 2 0902       Baseplate 16 x 24 with Set 080 Small White House Print              1
 3 0903       Baseplate 16 x 24 with Set 080 Red House Print                      1
# ... with 17,491 more rows
part_categories
# A tibble: 64 x 2
      id name                   
   <dbl> <chr>                  
 1     1 Baseplates             
 2     3 Bricks Sloped          
 3     4 Duplo, Quatro and Primo
 4     5 Bricks Special            
# ... with 54 more rows
parts %>% 
inner_join(part_categories, by = c("part_cat_id" = "id"))
# A tibble: 17,501 x 4
   part_num   name.x                                  part_cat_id name.y        
   <chr>      <chr>                                         <dbl> <chr>         
 1 0901       Baseplate 16 x 30 with Set 080 Yellow ~           1 Baseplates    
 2 0902       Baseplate 16 x 24 with Set 080 Small W~           1 Baseplates    
 3 0903       Baseplate 16 x 24 with Set 080 Red Hou~           1 Baseplates    

left_join

In this example we merge star_destroyer to millennium_falcon by both
color_id and part_num

millennium_falcon
# A tibble: 263 x 4
   set_num part_num color_id quantity
   <chr>   <chr>       <dbl>    <dbl>
 1 7965-1  63868          71       62
 2 7965-1  3023            0       60
 3 7965-1  3021           72       46

star_destroyer
# A tibble: 293 x 4
   set_num part_num color_id quantity
   <chr>   <chr>       <dbl>    <dbl>
 1 75190-1 6141           72       66
 2 75190-1 3020            0       38
 3 75190-1 2780            0       36
 
millennium_falcon %>%
left_join(star_destroyer, by = c("color_id", "part_num"), suffix = c("_falcon", "_star_destroyer"))

# A tibble: 263 x 6
   set_num_falcon part_num color_id quantity_falcon set_num_star_destroyer
   <chr>          <chr>       <dbl>           <dbl> <chr>                 
 1 7965-1         63868          71              62 NA                    
 2 7965-1         3023            0              60 NA                    
 3 7965-1         3021           72              46 75190-1  
 

group_by() and summarise()

In this example we will count how many thing in a group

# Aggregate Millennium Falcon for the total quantity in each part
millennium_falcon

# A tibble: 263 x 4
   set_num part_num color_id quantity
   <chr>   <chr>       <dbl>    <dbl>
 1 7965-1  63868          71       62
 2 7965-1  3023            0       60
# ... with 253 more rows
millennium_falcon_colors <- millennium_falcon %>%
  group_by(color_id) %>%
  summarize(total_quantity = sum(quantity))
	
millennium_falcon_colors 

# A tibble: 21 x 2
   color_id total_quantity
      <dbl>          <dbl>
 1        0            201
 2        1             15
 3        4             17
 4       14              3
# ... with 11 more rows

left_join and filter to find na value

Na is important for statistic because sometime our function will not work if we dont treat or ignore NA values

inventory_version_1 <- inventories %>%
  filter(version == 1)
head(inventory_version_1,3)
# A tibble: 3 x 3
     id version set_num
  <dbl>   <dbl> <chr>  
1     1       1 7922-1 
2     3       1 3931-1 
3     4       1 6942-1 
head(sets,3)
# A tibble: 3 x 4
  set_num   name                                         year theme_id
  <chr>     <chr>                                       <dbl>    <dbl>
1 700.3-1   Medium Gift Set (ABB)                        1949      365
2 700.1.1-1 Single 2 x 4 Brick (ABB)                     1950      371
3 700.B.2-1 Single 1 x 2 x 3 Window without Glass (ABB)  1950      371
# Join versions to sets
sets %>%
  left_join(inventory_version_1, by = 'set_num') %>%
  # Filter for where version is na
  filter(is.na(version))
# A tibble: 1 x 6
  set_num name       year theme_id    id version
  <chr>   <chr>     <dbl>    <dbl> <dbl>   <dbl>
1 40198-1 Ludo game  2018      598    NA      NA

right_join

This is similar to left join but it will merge into table on the right side

count()

count(x, …, wt = NULL, sort = FALSE, name = NULL)

  • wt stands for weight

In #A, it will count how many occurrence for each carrier in a dataset
#B, it will count how many flights for each carrier

# using lego sets
# we can download and read_csv function
count(flights, carrier) 													#A
# A tibble: 16 x 2
   carrier     n
   <chr>   <int>
 1 9E      18460
 2 AA      32729
 3 AS        714
 4 B6      54635
 5 DL      48110
 6 EV      54173
 
count(flights, carrier, wt = flight, sort = TRUE) #B
# A tibble: 16 x 2
   carrier         n
   <chr>       <int>
 1 EV      250500399
 2 MQ      101761406
 3 DL       66141518
 4 9E       65847885
 5 UA       56289091
 6 B6       36669259

joining their children

We can represent hirachy in table format by using number
example:

Car- Honda
|- Toyota
|- Ford - Bronco
|- F150

			id name           parent_id
   <dbl> <chr>              <dbl>
 1     1 Car           		    NA
 2     2 Honda								1
 3     3 Toyota								1
 4     4 Ford									1
 5     5 Bronco								4
 6     6 F150									4

nycflights13

> flights
# A tibble: 336,776 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute time_hour          
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl> <dttm>             
 1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15 2013-01-01 05:00:00
 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29 2013-01-01 05:00:00
 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40 2013-01-01 05:00:00
 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45 2013-01-01 05:00:00
 
 
> nycflights13::weather
# A tibble: 26,115 x 15
   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour          
   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>  <dbl>    <dbl> <dbl> <dttm>             
 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA      0    1012     10 2013-01-01 01:00:00
 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA      0    1012.    10 2013-01-01 02:00:00
 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA      0    1012.    10 2013-01-01 03:00:00