--- title: "Introduction" output: html_document: toc: true vignette: > %\VignetteIndexEntry{maditr: Introduction} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{utf8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) data.table::setDTthreads(2) ``` ## Links - [maditr on CRAN](https://cran.r-project.org/package=maditr) - [maditr on Github](https://github.com/gdemin/maditr) - [Issues](https://github.com/gdemin/maditr/issues) ## Overview Package provides pipe-style interface for [data.table](https://cran.r-project.org/package=data.table) package. It preserves all data.table features without significant impact on performance. `let` and `take` functions are simplified interfaces for most common data manipulation tasks. - To select rows from data: `rows(mtcars, am==0)` - To select columns from data: `columns(mtcars, mpg, vs:carb)` - To aggregate data: `take(mtcars, mean_mpg = mean(mpg), by = am)` - To aggregate all non-grouping columns: `take_all(mtcars, mean, by = am)` - To aggregate several columns with one summary: `take(mtcars, mpg, hp, fun = mean, by = am)` - To get total summary skip `by` argument: `take_all(mtcars, mean)` - Use magrittr pipe `%>%` to chain several operations: ```{r, eval=FALSE} mtcars %>% let(mpg_hp = mpg/hp) %>% take(mean(mpg_hp), by = am) ``` - To modify variables or add new variables: ```{r, eval=FALSE} mtcars %>% let(new_var = 42, new_var2 = new_var*hp) %>% head() ``` - To drop variable assign NULL: `let(mtcars, am = NULL) %>% head()` - To modify all non-grouping variables: ```{r, eval=FALSE} iris %>% let_all( scaled = (.x - mean(.x))/sd(.x), by = Species) %>% head() ``` - To aggregate all variables conditionally on name: ```{r, eval=FALSE} iris %>% take_all( mean = if(startsWith(.name, "Sepal")) mean(.x), median = if(startsWith(.name, "Petal")) median(.x), by = Species ) ``` - For parametric assignment use `:=`: ```{r, eval=FALSE} new_var = "my_var" old_var = "mpg" mtcars %>% let((new_var) := get(old_var)*2) %>% head() # or, expr = quote(mean(cyl)) mtcars %>% let((new_var) := eval(expr)) %>% head() # the same with `take` by_var = "vs,am" take(mtcars, (new_var) := eval(expr), by = by_var) ``` `query_if` function translates its arguments one-to-one to `[.data.table` method. Additionally there are some conveniences such as automatic `data.frame` conversion to `data.table`. ## vlookup & xlookup Let's make datasets for lookups: ```{r include=FALSE} library(maditr) ``` ```{r} workers = fread(" name company Nick Acme John Ajax Daniela Ajax ") positions = fread(" name position John designer Daniela engineer Cathie manager ") # xlookup workers = let(workers, position = xlookup(name, positions$name, positions$position) ) # vlookup # by default we search in the first column and return values from second column workers = let(workers, position = vlookup(name, positions, no_match = "Not found") ) # the same workers = let(workers, position = vlookup(name, positions, result_column = "position", no_match = "Not found") # or, result_column = 2 ) head(workers) ``` ## More examples We will use for demonstartion well-known `mtcars` dataset and some examples from `dplyr` package. ```{r} library(maditr) data(mtcars) # Newly created variables are available immediately mtcars %>% let( cyl2 = cyl * 2, cyl4 = cyl2 * 2 ) %>% head() # You can also use let() to remove variables and # modify existing variables mtcars %>% let( mpg = NULL, disp = disp * 0.0163871 # convert to litres ) %>% head() # window functions are useful for grouped computations mtcars %>% let(rank = rank(-mpg, ties.method = "min"), by = cyl) %>% head() # You can drop variables by setting them to NULL mtcars %>% let(cyl = NULL) %>% head() # keeps all existing variables mtcars %>% let(displ_l = disp / 61.0237) %>% head() # keeps only the variables you create mtcars %>% take(displ_l = disp / 61.0237) %>% head() # can refer to both contextual variables and variable names: var = 100 mtcars %>% let(cyl = cyl * var) %>% head() # select rows mtcars %>% rows(am==0) %>% head() # select rows with compound condition mtcars %>% rows(am==0 & mpg>mean(mpg)) # select columns mtcars %>% columns(vs:carb, cyl) mtcars %>% columns(-am, -cyl) # regular expression pattern columns(iris, "^Petal") %>% head() # variables which start from 'Petal' columns(iris, "Width$") %>% head() # variables which end with 'Width' # move Species variable to the front # pattern "^." matches all variables columns(iris, Species, "^.") %>% head() # pattern "^.*al" means "contains 'al'" columns(iris, "^.*al") %>% head() # numeric indexing - all variables except Species columns(iris, 1:4) %>% head() # A 'take' with summary functions applied without 'by' argument returns an aggregated data mtcars %>% take(mean = mean(disp), n = .N) # Usually, you'll want to group first mtcars %>% take(mean = mean(disp), n = .N, by = am) # grouping by multiple variables mtcars %>% take(mean = mean(disp), n = .N, by = list(am, vs)) # You can group by expressions: mtcars %>% take_all( mean, by = list(vsam = vs + am) ) # modify all non-grouping variables in-place mtcars %>% let_all((.x - mean(.x))/sd(.x), by = am) %>% head() # modify all non-grouping variables to new variables mtcars %>% let_all(scaled = (.x - mean(.x))/sd(.x), by = am) %>% head() # conditionally modify all variables iris %>% let_all(mean = if(is.numeric(.x)) mean(.x)) %>% head() # modify all variables conditionally on name iris %>% let_all( mean = if(startsWith(.name, "Sepal")) mean(.x), median = if(startsWith(.name, "Petal")) median(.x), by = Species ) %>% head() # aggregation with 'take_all' mtcars %>% take_all(mean = mean(.x), sd = sd(.x), n = .N, by = am) # conditionally aggregate all variables iris %>% take_all(mean = if(is.numeric(.x)) mean(.x)) # aggregate all variables conditionally on name iris %>% take_all( mean = if(startsWith(.name, "Sepal")) mean(.x), median = if(startsWith(.name, "Petal")) median(.x), by = Species ) # parametric evaluation: var = quote(mean(cyl)) mtcars %>% let(mean_cyl = eval(var)) %>% head() take(mtcars, eval(var)) # all together new_var = "mean_cyl" mtcars %>% let((new_var) := eval(var)) %>% head() take(mtcars, (new_var) := eval(var)) ``` ## Variable selection in the expressions You can use 'columns' inside expression in the 'take'/'let'. 'columns' will be replaced with data.table with selected columns. In 'let' in the expressions with ':=', 'cols' or '%to%' can be placed in the left part of the expression. It is usefull for multiple assignment. There are four ways of column selection: 1. Simply by column names 2. By variable ranges, e. g. vs:carb. Alternatively, you can use '%to%' instead of colon: 'vs %to% carb'. 3. With regular expressions. Characters which start with '^' or end with $ considered as Perl-style regular expression patterns. For example, '^Petal' returns all variables started with 'Petal'. 'Width$' returns all variables which end with 'Width'. Pattern '^.' matches all variables and pattern '^.*my_str' is equivalent to contains "my_str"'. 4. By character variables with interpolated parts. Expression in the curly brackets inside characters will be evaluated in the parent frame with 'text_expand' function. For example, `a{1:3}` will be transformed to the names 'a1', 'a2', 'a3'. 'cols' is just a shortcut for 'columns'. ```{r} # range selection iris %>% let( avg = rowMeans(Sepal.Length %to% Petal.Width) ) %>% head() # multiassignment iris %>% let( # starts with Sepal or Petal multipled1 %to% multipled4 := cols("^(Sepal|Petal)")*2 ) %>% head() mtcars %>% let( # text expansion cols("scaled_{names(mtcars)}") := lapply(cols("{names(mtcars)}"), scale) ) %>% head() # range selection in 'by' # selection of range + additional column mtcars %>% take( res = sum(cols(mpg, disp %to% drat)), by = vs %to% gear ) ``` ## Joins Here we use the same datasets as with lookups: ```{r} workers = fread(" name company Nick Acme John Ajax Daniela Ajax ") positions = fread(" name position John designer Daniela engineer Cathie manager ") workers positions ``` Different kinds of joins: ```{r} workers %>% dt_inner_join(positions) workers %>% dt_left_join(positions) workers %>% dt_right_join(positions) workers %>% dt_full_join(positions) # filtering joins workers %>% dt_anti_join(positions) workers %>% dt_semi_join(positions) ``` To suppress the message, supply `by` argument: ```{r, eval=FALSE} workers %>% dt_left_join(positions, by = "name") ``` Use a named `by` if the join variables have different names: ```{r, eval=FALSE} positions2 = setNames(positions, c("worker", "position")) # rename first column in 'positions' workers %>% dt_inner_join(positions2, by = c("name" = "worker")) ``` ## 'dplyr'-like interface for data.table. There are a small subset of 'dplyr' verbs to work with data.table. Note that there is no `group_by` verb - use by or keyby argument when needed. - `dt_mutate` adds new variables or modify existing variables. If data is data.table then it modifies in-place. - `dt_summarize` computes summary statistics. Splits the data into subsets, computes summary statistics for each, and returns the result in the "data.table" form. - `dt_summarize_all` the same as `dt_summarize` but work over all non-grouping variables. - `dt_filter` Selects rows/cases where conditions are true. Rows where the condition evaluates to NA are dropped. - `dt_select` Selects column/variables from the data set. Range of variables are supported, e. g. `vs:carb`. Characters which start with `^` or end with `\$` considered as Perl-style regular expression patterns. For example, `'^Petal'` returns all variables started with 'Petal'. `'Width\$'` returns all variables which end with 'Width'. Pattern `^.` matches all variables and pattern `'^.*my_str'` is equivalent to contains `"my_str"`. See examples. - `dt_arrange` sorts dataset by variable(-s). Use '-' to sort in desending order. If data is data.table then it modifies in-place. The same examples with 'dplyr'-verbs: ```{r} # examples from 'dplyr' # newly created variables are available immediately mtcars %>% dt_mutate( cyl2 = cyl * 2, cyl4 = cyl2 * 2 ) %>% head() # you can also use dt_mutate() to remove variables and # modify existing variables mtcars %>% dt_mutate( mpg = NULL, disp = disp * 0.0163871 # convert to litres ) %>% head() # window functions are useful for grouped mutates mtcars %>% dt_mutate( rank = rank(-mpg, ties.method = "min"), keyby = cyl) %>% print() # You can drop variables by setting them to NULL mtcars %>% dt_mutate(cyl = NULL) %>% head() # A summary applied without by returns a single row mtcars %>% dt_summarise(mean = mean(disp), n = .N) # Usually, you'll want to group first mtcars %>% dt_summarise(mean = mean(disp), n = .N, by = cyl) # Multiple 'by' - variables mtcars %>% dt_summarise(cyl_n = .N, by = list(cyl, vs)) # Newly created summaries immediately # doesn't overwrite existing variables mtcars %>% dt_summarise(disp = mean(disp), sd = sd(disp), by = cyl) # You can group by expressions: mtcars %>% dt_summarise_all(mean, by = list(vsam = vs + am)) # filter by condition mtcars %>% dt_filter(am==0) # filter by compound condition mtcars %>% dt_filter(am==0, mpg>mean(mpg)) # select mtcars %>% dt_select(vs:carb, cyl) %>% head() mtcars %>% dt_select(-am, -cyl) %>% head() # regular expression pattern dt_select(iris, "^Petal") %>% head() # variables which start from 'Petal' dt_select(iris, "Width$") %>% head() # variables which end with 'Width' # move Species variable to the front # pattern "^." matches all variables dt_select(iris, Species, "^.") %>% head() # pattern "^.*al" means "contains 'al'" dt_select(iris, "^.*al") %>% head() dt_select(iris, 1:4) %>% head() # numeric indexing - all variables except Species # sorting dt_arrange(mtcars, cyl, disp) dt_arrange(mtcars, -disp) ```