Search notes:

R package: dplyr

SQL like access

The arguably six most important functions of dplyr provide SQL like access to dataframes. These functions are
The first argument in these functions is always a data frame and these functions return a (new) data frame.
library(dplyr)

df_orig <- data.frame (
    id  = c(    1 ,     2 ,     3  ,     4 ,     5 ,     6 ,      7 ,      8 ,     9 ,     10 ),
    num = c( 'one',  'two', 'three', 'four', 'five',  'six', 'seven', 'eight', 'nine',   'ten'),
    val = c( 8.32 ,  5.41 ,   0.01 ,  7.71 ,  2.18 ,  7.35 ,   8.69 ,   4.72 , 12.68 ,   5.31 )
)


#
#    select
#       id     as ident,
#       num    as num_english,
#       val    as price
#    from
#       df_orig
rename(df_orig, ident = id, num_english = num, price = val)

#
#   create table
#      df_val_between_3_and_10 as
#   select
#      *
#   from
#     df_orig
#   where
#     val >   3 and
#     val <= 10;
#   
#
df_val_between_3_and_10 <- filter(df_orig, val > 3, val <= 10)

#
#    create table
#       df_val_ordered as
#    select
#       *
#    from
#       df_val_between_3_and_10
#    order by
#       val;
#
df_val_ordered <- arrange(df_val_between_3_and_10, val)

#
#    create table
#       df_with_val_squared as
#    select
#       *
#    from
#       df_val_between_3_and_10
#    order by
#       val;
#
df_with_val_squared <- mutate(df_val_ordered, val_squared = val *  2)

#
#    select
#       round(val),
#       min  (val),
#       max  (val),
#       avg  (val_squared)
#    from
#       df_with_val_squared 
#    group by
#       round(val);
#
#
summarise ( 
            group_by(df_with_val_squared, round(val)),
            min(val),
            max(val),
            mean(val_squared)
          )
Github repository about-r, path: /packages/dplyr/sql-like.r

Count a categorical value

A data frame is piped (%>%) into the dplyr function count to count the occurences of each value (that is: the levels) of the categorical variable (aka factor):
library(dplyr)

df <- data.frame (
  item = c('foo', 'bar', 'foo', 'foo', 'bar', 'foo', 'baz', 'bar'),
  val  = c(   4 ,    7 ,    3 ,    3 ,    5 ,    6 ,    5 ,    4 )
)

df %>% count(item)
#   
#   # A tibble: 3 x 2
#     item      n
#     <fct> <int>
#   1 bar       3
#   2 baz       1
#   3 foo       4
#
Github repository about-r, path: /packages/dplyr/count-categorical-value.R

Count a continuous variable

Using the cut_width() function of ggplot2, a continuous variable can be cut into equally sized widths and then be counted:
library(dplyr)
library(ggplot2)

df <- read.table (
  header = TRUE,
  text   = 'item val
foo  17
bar  39
foo  22
bar  18
foo  31
baz  24
foo  24
bar  19
baz  17
foo  21
bar  31
baz  22
bar  11
foo  23'
)

df %>% count(cut_width(val, 5))
#
#   # A tibble: 6 x 2
#     `cut_width(val, 5)`     n
#     <fct>               <int>
#   1 [7.5,12.5]              1
#   2 (12.5,17.5]             2
#   3 (17.5,22.5]             5
#   4 (22.5,27.5]             3
#   5 (27.5,32.5]             2
#   6 (37.5,42.5]             1
#
Github repository about-r, path: /packages/dplyr/count-continuous-value.R

Assign unique number to rows

mutate() in combination with row_number() allows to assign a unique number (id) to each row in a data set. In the following example, the new column is named id.
library(dplyr)

df <- data.frame(
  col_1 = c('foo', 'bar', 'foo', 'baz'),
  col_2 = c( 7.2 ,  1.8 ,  2.0 ,  4.1 )
)

df                          %>%
  mutate(id = row_number()) %>%  # Assign (unique) id with row_number
  select(id, col_1, col_2 )      # Re-order columns
Github repository about-r, path: /packages/dplyr/assign-id-with-rownumber.R

filter

As there is a between operator in SQL, there is also a between in dplyr:
library(dplyr)

df <- data.frame(
  col_1 = c('one', 'two', 'three', 'four', 'five'),
  col_2 = c( 7.2 ,  4.1 ,    1.8 ,   2.0 ,   6.2 )
)

filter(df,
  between(col_2, 2, 7)
)
Github repository about-r, path: /packages/dplyr/filter/between.R

near()

near() is a handy function that checks if two numbers are very close to each other. The function returns TRUE where the == operator would yield false because of finite precision arithmetic.
library(dplyr)

sqrt(7) ^ 2   == 7
# FALSE

1 / 49  * 49  == 1
# FALSE

near( sqrt(7)^2 , 7)
# TRUE

near(1 / 49 * 49, 1)
# TRUE
Github repository about-r, path: /packages/dplyr/near.R

case when

The function case_when mimics the SQL statement case when.
library(dplyr)

values <- c(9.1, 3.4, 5.6, 2.5, 7.9);

texts <- case_when(
          values <  0  ~   '<0' ,
          values <  3  ~   '<3' ,
          values <  5  ~  '3-5' ,
          values <  8  ~  '5-8' ,
          values < 10  ~ '8-10' ,
          TRUE         ~   '>10'
);

print(texts);
#
#  "8-10" "3-5"  "5-8"  "<3"   "5-8"
Github repository about-r, path: /packages/dplyr/case_when.R
See also the switch statement combined with findInterval(…).

show_query

If dplyr is used to access databases via DBI rather than data frames, dplyr will dymaically create SQL statements.
In such cases, show_query(…) can then be used to display the generated SQL statement:
library(dplyr  )
library(RSQLite)

sqlLite <- dbConnect(RSQLite::SQLite(), ':memory:');

# df <- data.frame(
#    col_one   = c('foo', 'bar', 'baz'  ,'foo' , 'foo' , 'baz', 'bar'  , 'bar'  ),
#    col_two   = c('one', 'two', 'three','four', 'five', 'six', 'seven', 'eight'),
#    col_three = c( 1   ,  2   ,  3     , 4    ,  5    ,  6   ,  7     ,  8     )
# );

dbWriteTable(sqlLite, 'tab', data.frame(
   col_one   = c('foo', 'bar', 'baz'  ,'foo' , 'foo' , 'baz', 'bar'  , 'bar'  ),
   col_two   = c('one', 'two', 'three','four', 'five', 'six', 'seven', 'eight'),
   col_three = c( 1   ,  2   ,  3     , 4    ,  5    ,  6   ,  7     ,  8     )
));

ref_tibble <- tbl(sqlLite, 'tab');

query <- ref_tibble                            %>%
         filter    (col_three %% 3 > 0      )  %>%
         group_by  (col_one                 )  %>%
         summarise (min_two = min(col_two),
                    max_two = max(col_two)  )

show_query(query);
#
#   <SQL>
#   SELECT `col_one`, MIN(`col_two`) AS `min_two`, MAX(`col_two`) AS `max_two`
#   FROM `tab`
#   WHERE (`col_three` % 3.0 > 0.0)
#   GROUP BY `col_one`


dbDisconnect(sqlLite);
Github repository about-r, path: /packages/dplyr/sql/show_query.R

See also

dplyr is part of the tidyverse.
R packages

Index