Search notes:
R package: dplyr
SQL like access
The arguably six most important functions of
dplyr provide
SQL like access to
dataframes . These functions are
filter()
to select observations by their values
arrange()
to reorder rows
mutate()
to create new variables with functions of existing values
summarise()
to create summaries of values
select()
to select variables by their names
group_by()
The first argument in these functions is always a
data frame and these functions return a (new) data frame.
library(dplyr)
df_orig <- data.frame (
id = c( 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ),
num = c( 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'),
val = c( 8.32 , 5.41 , 0.01 , 7.71 , 2.18 , 7.35 , 8.69 , 4.72 , 12.68 , 5.31 )
)
#
# select
# id as ident,
# num as num_english,
# val as price
# from
# df_orig
rename(df_orig, ident = id, num_english = num, price = val)
#
# create table
# df_val_between_3_and_10 as
# select
# *
# from
# df_orig
# where
# val > 3 and
# val <= 10;
#
#
df_val_between_3_and_10 <- filter(df_orig, val > 3, val <= 10)
#
# create table
# df_val_ordered as
# select
# *
# from
# df_val_between_3_and_10
# order by
# val;
#
df_val_ordered <- arrange(df_val_between_3_and_10, val)
#
# create table
# df_with_val_squared as
# select
# *
# from
# df_val_between_3_and_10
# order by
# val;
#
df_with_val_squared <- mutate(df_val_ordered, val_squared = val * 2)
#
# select
# round(val),
# min (val),
# max (val),
# avg (val_squared)
# from
# df_with_val_squared
# group by
# round(val);
#
#
summarise (
group_by(df_with_val_squared, round(val)),
min(val),
max(val),
mean(val_squared)
)
Count a categorical value
A
data frame is piped (
%>%
) into the
dplyr
function
count
to count the occurences of each value (that is: the
levels ) of the categorical variable (aka
factor ):
library(dplyr)
df <- data.frame (
item = c('foo', 'bar', 'foo', 'foo', 'bar', 'foo', 'baz', 'bar'),
val = c( 4 , 7 , 3 , 3 , 5 , 6 , 5 , 4 )
)
df %>% count(item)
#
# # A tibble: 3 x 2
# item n
# <fct> <int>
# 1 bar 3
# 2 baz 1
# 3 foo 4
#
Count a continuous variable
Using the
cut_width()
function of
ggplot2
, a continuous variable can be cut into equally sized widths and then be counted:
library(dplyr)
library(ggplot2)
df <- read.table (
header = TRUE,
text = 'item val
foo 17
bar 39
foo 22
bar 18
foo 31
baz 24
foo 24
bar 19
baz 17
foo 21
bar 31
baz 22
bar 11
foo 23'
)
df %>% count(cut_width(val, 5))
#
# # A tibble: 6 x 2
# `cut_width(val, 5)` n
# <fct> <int>
# 1 [7.5,12.5] 1
# 2 (12.5,17.5] 2
# 3 (17.5,22.5] 5
# 4 (22.5,27.5] 3
# 5 (27.5,32.5] 2
# 6 (37.5,42.5] 1
#
Assign unique number to rows
mutate()
in combination with row_number()
allows to assign a unique number (id) to each row in a data set. In the following example, the new column is named id
.
library(dplyr)
df <- data.frame(
col_1 = c('foo', 'bar', 'foo', 'baz'),
col_2 = c( 7.2 , 1.8 , 2.0 , 4.1 )
)
df %>%
mutate(id = row_number()) %>% # Assign (unique) id with row_number
select(id, col_1, col_2 ) # Re-order columns
filter
As there is a
between
operator in
SQL , there is also a
between
in dplyr:
library(dplyr)
df <- data.frame(
col_1 = c('one', 'two', 'three', 'four', 'five'),
col_2 = c( 7.2 , 4.1 , 1.8 , 2.0 , 6.2 )
)
filter(df,
between(col_2, 2, 7)
)
near()
near()
is a handy function that checks if two numbers are very close to each other. The function returns TRUE
where the ==
operator would yield false because of finite precision arithmetic.
library(dplyr)
sqrt(7) ^ 2 == 7
# FALSE
1 / 49 * 49 == 1
# FALSE
near( sqrt(7)^2 , 7)
# TRUE
near(1 / 49 * 49, 1)
# TRUE
case when
The function
case_when
mimics the
SQL statement
case when
.
library(dplyr)
values <- c(9.1, 3.4, 5.6, 2.5, 7.9);
texts <- case_when(
values < 0 ~ '<0' ,
values < 3 ~ '<3' ,
values < 5 ~ '3-5' ,
values < 8 ~ '5-8' ,
values < 10 ~ '8-10' ,
TRUE ~ '>10'
);
print(texts);
#
# "8-10" "3-5" "5-8" "<3" "5-8"
show_query
In such cases, show_query(…)
can then be used to display the generated SQL statement:
library(dplyr )
library(RSQLite)
sqlLite <- dbConnect(RSQLite::SQLite(), ':memory:');
# df <- data.frame(
# col_one = c('foo', 'bar', 'baz' ,'foo' , 'foo' , 'baz', 'bar' , 'bar' ),
# col_two = c('one', 'two', 'three','four', 'five', 'six', 'seven', 'eight'),
# col_three = c( 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 )
# );
dbWriteTable(sqlLite, 'tab', data.frame(
col_one = c('foo', 'bar', 'baz' ,'foo' , 'foo' , 'baz', 'bar' , 'bar' ),
col_two = c('one', 'two', 'three','four', 'five', 'six', 'seven', 'eight'),
col_three = c( 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 )
));
ref_tibble <- tbl(sqlLite, 'tab');
query <- ref_tibble %>%
filter (col_three %% 3 > 0 ) %>%
group_by (col_one ) %>%
summarise (min_two = min(col_two),
max_two = max(col_two) )
show_query(query);
#
# <SQL>
# SELECT `col_one`, MIN(`col_two`) AS `min_two`, MAX(`col_two`) AS `max_two`
# FROM `tab`
# WHERE (`col_three` % 3.0 > 0.0)
# GROUP BY `col_one`
dbDisconnect(sqlLite);