R package: dplyr

SQL like access

The arguably six most important functions of dplyr provide SQL like access to dataframes. These functions are

filter() to select observations by their values
arrange() to reorder rows
mutate() to create new variables with functions of existing values
summarise() to create summaries of values
select() to select variables by their names
group_by()

The first argument in these functions is always a data frame and these functions return a (new) data frame.

library(dplyr)

df_orig <- data.frame (
    id  = c(    1 ,     2 ,     3  ,     4 ,     5 ,     6 ,      7 ,      8 ,     9 ,     10 ),
    num = c( 'one',  'two', 'three', 'four', 'five',  'six', 'seven', 'eight', 'nine',   'ten'),
    val = c( 8.32 ,  5.41 ,   0.01 ,  7.71 ,  2.18 ,  7.35 ,   8.69 ,   4.72 , 12.68 ,   5.31 )
)


#
#    select
#       id     as ident,
#       num    as num_english,
#       val    as price
#    from
#       df_orig
rename(df_orig, ident = id, num_english = num, price = val)

#
#   create table
#      df_val_between_3_and_10 as
#   select
#      *
#   from
#     df_orig
#   where
#     val >   3 and
#     val <= 10;
#   
#
df_val_between_3_and_10 <- filter(df_orig, val > 3, val <= 10)

#
#    create table
#       df_val_ordered as
#    select
#       *
#    from
#       df_val_between_3_and_10
#    order by
#       val;
#
df_val_ordered <- arrange(df_val_between_3_and_10, val)

#
#    create table
#       df_with_val_squared as
#    select
#       *
#    from
#       df_val_between_3_and_10
#    order by
#       val;
#
df_with_val_squared <- mutate(df_val_ordered, val_squared = val *  2)

#
#    select
#       round(val),
#       min  (val),
#       max  (val),
#       avg  (val_squared)
#    from
#       df_with_val_squared 
#    group by
#       round(val);
#
#
summarise ( 
            group_by(df_with_val_squared, round(val)),
            min(val),
            max(val),
            mean(val_squared)
          )

Github repository about-r, path: /packages/dplyr/sql-like.r

Count a categorical value

A data frame is piped (%>%) into the dplyr function count to count the occurences of each value (that is: the levels) of the categorical variable (aka factor):

library(dplyr)

df <- data.frame (
  item = c('foo', 'bar', 'foo', 'foo', 'bar', 'foo', 'baz', 'bar'),
  val  = c(   4 ,    7 ,    3 ,    3 ,    5 ,    6 ,    5 ,    4 )
)

df %>% count(item)
#   
#   # A tibble: 3 x 2
#     item      n
#     <fct> <int>
#   1 bar       3
#   2 baz       1
#   3 foo       4
#

Github repository about-r, path: /packages/dplyr/count-categorical-value.R

Count a continuous variable

Using the cut_width() function of ggplot2, a continuous variable can be cut into equally sized widths and then be counted:

library(dplyr)
library(ggplot2)

df <- read.table (
  header = TRUE,
  text   = 'item val
foo  17
bar  39
foo  22
bar  18
foo  31
baz  24
foo  24
bar  19
baz  17
foo  21
bar  31
baz  22
bar  11
foo  23'
)

df %>% count(cut_width(val, 5))
#
#   # A tibble: 6 x 2
#     `cut_width(val, 5)`     n
#     <fct>               <int>
#   1 [7.5,12.5]              1
#   2 (12.5,17.5]             2
#   3 (17.5,22.5]             5
#   4 (22.5,27.5]             3
#   5 (27.5,32.5]             2
#   6 (37.5,42.5]             1
#

Github repository about-r, path: /packages/dplyr/count-continuous-value.R

Assign unique number to rows

mutate() in combination with row_number() allows to assign a unique number (id) to each row in a data set. In the following example, the new column is named id.

library(dplyr)

df <- data.frame(
  col_1 = c('foo', 'bar', 'foo', 'baz'),
  col_2 = c( 7.2 ,  1.8 ,  2.0 ,  4.1 )
)

df                          %>%
  mutate(id = row_number()) %>%  # Assign (unique) id with row_number
  select(id, col_1, col_2 )      # Re-order columns

Github repository about-r, path: /packages/dplyr/assign-id-with-rownumber.R

filter

As there is a between operator in SQL, there is also a between in dplyr:

library(dplyr)

df <- data.frame(
  col_1 = c('one', 'two', 'three', 'four', 'five'),
  col_2 = c( 7.2 ,  4.1 ,    1.8 ,   2.0 ,   6.2 )
)

filter(df,
  between(col_2, 2, 7)
)

Github repository about-r, path: /packages/dplyr/filter/between.R

near()

near() is a handy function that checks if two numbers are very close to each other. The function returns TRUE where the == operator would yield false because of finite precision arithmetic.

library(dplyr)

sqrt(7) ^ 2   == 7
# FALSE

1 / 49  * 49  == 1
# FALSE

near( sqrt(7)^2 , 7)
# TRUE

near(1 / 49 * 49, 1)
# TRUE

Github repository about-r, path: /packages/dplyr/near.R

case when

The function case_when mimics the SQL statement case when.

library(dplyr)

values <- c(9.1, 3.4, 5.6, 2.5, 7.9);

texts <- case_when(
          values <  0  ~   '<0' ,
          values <  3  ~   '<3' ,
          values <  5  ~  '3-5' ,
          values <  8  ~  '5-8' ,
          values < 10  ~ '8-10' ,
          TRUE         ~   '>10'
);

print(texts);
#
#  "8-10" "3-5"  "5-8"  "<3"   "5-8"

Github repository about-r, path: /packages/dplyr/case_when.R

See also the switch statement combined with findInterval(…).

show_query

If dplyr is used to access databases via DBI rather than data frames, dplyr will dymaically create SQL statements.

In such cases, show_query(…) can then be used to display the generated SQL statement:

library(dplyr  )
library(RSQLite)

sqlLite <- dbConnect(RSQLite::SQLite(), ':memory:');

# df <- data.frame(
#    col_one   = c('foo', 'bar', 'baz'  ,'foo' , 'foo' , 'baz', 'bar'  , 'bar'  ),
#    col_two   = c('one', 'two', 'three','four', 'five', 'six', 'seven', 'eight'),
#    col_three = c( 1   ,  2   ,  3     , 4    ,  5    ,  6   ,  7     ,  8     )
# );

dbWriteTable(sqlLite, 'tab', data.frame(
   col_one   = c('foo', 'bar', 'baz'  ,'foo' , 'foo' , 'baz', 'bar'  , 'bar'  ),
   col_two   = c('one', 'two', 'three','four', 'five', 'six', 'seven', 'eight'),
   col_three = c( 1   ,  2   ,  3     , 4    ,  5    ,  6   ,  7     ,  8     )
));

ref_tibble <- tbl(sqlLite, 'tab');

query <- ref_tibble                            %>%
         filter    (col_three %% 3 > 0      )  %>%
         group_by  (col_one                 )  %>%
         summarise (min_two = min(col_two),
                    max_two = max(col_two)  )

show_query(query);
#
#   <SQL>
#   SELECT `col_one`, MIN(`col_two`) AS `min_two`, MAX(`col_two`) AS `max_two`
#   FROM `tab`
#   WHERE (`col_three` % 3.0 > 0.0)
#   GROUP BY `col_one`


dbDisconnect(sqlLite);

Github repository about-r, path: /packages/dplyr/sql/show_query.R