Quickstart examples

Drake has small self-contained built-in examples. To see the names of the available examples, use

examples_drake()
## [1] "basic"

Then use example_drake() to write the files for the example to your working directory. This vignette walks through the "basic" example, for which you can get the code with

example_drake("basic")

Setting up the basic example

Load your libraries first. Drake will detect loaded packages and reload them on all your compute nodes, if applicable.

library(knitr)
library(rmarkdown)
library(drake)

This example is a simulation study, and we're using a function to simulate random datasets. Enter your simulation function and drake will import it automatically.

# Simulate one random dataset of n rows: a standard-normal predictor x
# and a Poisson(1) response y.
#
# Args:
#   n: number of rows to draw.
# Returns:
#   A data.frame with numeric column x and integer column y.
simulate = function(n){
  x_draws = rnorm(n)       # drawn first, matching the original argument order
  y_draws = rpois(n, 1)
  data.frame(x = x_draws, y = y_draws)
}

For each dataset we simulate, we'll apply a bunch of methods of analysis.

# Fit a simple linear regression of y on x.
#
# Args:
#   d: a data.frame with numeric columns x and y.
# Returns:
#   An lm fit of y ~ x.
reg1 = function(d){
  # The formula was written `y ~ + x`; the unary plus is a no-op, so it is
  # dropped here. The fitted model is identical.
  lm(y ~ x, data = d)
}

# Fit a linear regression of y on the square of x.
#
# Args:
#   d: a data.frame with numeric columns x and y.
# Returns:
#   An lm fit of y ~ x2, where x2 = x^2 is derived inside the function.
reg2 = function(d){
  d[["x2"]] = d[["x"]]^2
  lm(y ~ x2, data = d)
}

After we're done, we'll want to knit a dynamic R Markdown report with a bunch of results.

# Knit an R Markdown file to Markdown via knitr::knit().
#
# Args:
#   file: path to the .Rmd source file.
#   ...:  ignored at run time. The extra arguments exist so a workflow plan
#         command such as my_knit('report.Rmd', report_dependencies) can
#         mention targets that drake should treat as dependencies of the
#         report (see the narrative around the `report` plan below).
# Returns:
#   The value of knit(file) — the output file path.
my_knit = function(file, ...){
  knit(file) # drake knows you loaded the knitr package
}

# Render a Markdown/R Markdown file via rmarkdown::render().
#
# Args:
#   file: path to the source file to render.
#   ...:  ignored at run time; present so plan commands can list extra
#         targets as dependencies for drake to track (same pattern as
#         my_knit above).
# Returns:
#   The value of render(file) — the output file path.
my_render = function(file, ...){
  render(file) # drake knows you loaded the rmarkdown package
}

The example provides an example report.Rmd, which uses readd() and loadd() to load objects we'll generate with drake.

# Write the R Markdown source for a dynamic knitr report. The chunk reads
# targets back out of the drake cache with readd()/loadd().
report_lines = c(
  "---",
  "title: Example Report",
  "author: You",
  "output: html_document",
  "---",
  "",
  "Look how I read outputs from the drake cache.",
  "",
  "```{r example_chunk}",
  "library(drake)",
  "readd(small)",
  "readd(coef_regression2_small)",
  "loadd(large)",
  "head(large)",
  "```"
)
writeLines(report_lines, con = "report.Rmd")

The workflow plan

In drake, your workflow plan is organized into a data frame. Each row represents a target, which is either a variable or a file that will be produced with a single command. Here is the part of the plan that generates our datasets.

datasets = plan(
  small = simulate(5),
  large = simulate(50))
datasets
##   target      command
## 1  small  simulate(5)
## 2  large simulate(50)

Commands need not be function calls. They can be any kind of R expression except for formulas with ~ and function definitions. If we want multiple replicates, we can just use expand, but let's just stick to our two datasets here.

expand(datasets, values = c("rep1", "rep2"))
##       target      command
## 1 small_rep1  simulate(5)
## 2 small_rep2  simulate(5)
## 3 large_rep1 simulate(50)
## 4 large_rep2 simulate(50)

To plan our analyses, we first declare the methods we will use.

methods = plan(
  regression1 = reg1(..dataset..),
  regression2 = reg2(..dataset..))
methods
##        target           command
## 1 regression1 reg1(..dataset..)
## 2 regression2 reg2(..dataset..)

The wildcard placeholder ..dataset.. says to substitute the names of our datasets one at a time in our actual analysis plan.

analyses = analyses(methods, data = datasets)
analyses
##              target     command
## 1 regression1_small reg1(small)
## 2 regression1_large reg1(large)
## 3 regression2_small reg2(small)
## 4 regression2_large reg2(large)

Now, we should summarize each analysis of each dataset a few different ways.

summary_types = plan(summ = summary(..analysis..),
                     coef = coef(..analysis..))
summary_types
##   target               command
## 1   summ summary(..analysis..)
## 2   coef    coef(..analysis..)
results = summaries(summary_types, analyses, datasets, 
  gather = NULL)
results
##                   target                    command
## 1 summ_regression1_small summary(regression1_small)
## 2 summ_regression1_large summary(regression1_large)
## 3 summ_regression2_small summary(regression2_small)
## 4 summ_regression2_large summary(regression2_large)
## 5 coef_regression1_small    coef(regression1_small)
## 6 coef_regression1_large    coef(regression1_large)
## 7 coef_regression2_small    coef(regression2_small)
## 8 coef_regression2_large    coef(regression2_large)

The gather argument of summaries is used to group summaries together by type, and I am skipping it here to make the workflow plan data frames more readable. The ..analysis.. wildcard acts similarly to the ..dataset.. wildcard. Functions analyses() and summaries() make use of evaluate() and gather() behind the scenes, which you can also use directly for added flexibility.

For the dynamic report, we have to tell drake which targets will be loaded into the embedded R chunks. That way, when the targets change, the report will automatically rebuild.

load_in_report = plan(
  report_dependencies = c(small, large, coef_regression2_small))
load_in_report
##                target                                 command
## 1 report_dependencies c(small, large, coef_regression2_small)

In the commands to render the report, keep in mind the rule for working with files: use single quotes to declare external file targets and dependencies, and use double quotes to remove any special meaning from character strings. Single quotes inside any custom functions are ignored, so this mechanism only works for your workflow plan data frame. To help automate the correct quoting, you may wish to use the functions quotes(), unquote(), and strings() from the eply package. Also, please be aware that drake cannot track entire directories (folders).

report = plan(
  report.md = my_knit('report.Rmd', report_dependencies),
## The html report requires pandoc. Commented out.
## report.html = my_render('report.md', report_dependencies),
  file_targets = TRUE, strings_in_dots = "filenames")
report
##        target                                    command
## 1 'report.md' my_knit('report.Rmd', report_dependencies)

To finish planning your full workflow, use rbind() to piece all your commands together. Row order does not matter here. Drake knows which commands to run first.

plan = rbind(report, datasets, load_in_report, analyses, results)
plan
##                    target                                    command
## 1             'report.md' my_knit('report.Rmd', report_dependencies)
## 2                   small                                simulate(5)
## 3                   large                               simulate(50)
## 4     report_dependencies    c(small, large, coef_regression2_small)
## 5       regression1_small                                reg1(small)
## 6       regression1_large                                reg1(large)
## 7       regression2_small                                reg2(small)
## 8       regression2_large                                reg2(large)
## 9  summ_regression1_small                 summary(regression1_small)
## 10 summ_regression1_large                 summary(regression1_large)
## 11 summ_regression2_small                 summary(regression2_small)
## 12 summ_regression2_large                 summary(regression2_large)
## 13 coef_regression1_small                    coef(regression1_small)
## 14 coef_regression1_large                    coef(regression1_large)
## 15 coef_regression2_small                    coef(regression2_small)
## 16 coef_regression2_large                    coef(regression2_large)

Use the tracked() function to list which objects, functions, files, targets, etc. that drake tries to reproducibly track. (I say “tries” because if you mention a symbol in a command or function but drake cannot find it, drake will skip it, and if verbose is TRUE, you will be notified.) Drake is not perfect, and it can miss dependencies in some edge cases, so you should inspect the output of tracked(). (See also build_graph() and plot_graph() to inspect the dependency tree of your project.)

"small" %in% tracked(plan)
## [1] TRUE
tracked(plan, targets = "small")
## [1] "small"      "simulate"   "data.frame" "rnorm"      "rpois"
tracked(plan)
##  [1] "'report.md'"            "small"                 
##  [3] "large"                  "report_dependencies"   
##  [5] "regression1_small"      "regression1_large"     
##  [7] "regression2_small"      "regression2_large"     
##  [9] "summ_regression1_small" "summ_regression1_large"
## [11] "summ_regression2_small" "summ_regression2_large"
## [13] "coef_regression1_small" "coef_regression1_large"
## [15] "coef_regression2_small" "coef_regression2_large"
## [17] "my_knit"                "simulate"              
## [19] "reg1"                   "reg2"                  
## [21] "'report.Rmd'"           "c"                     
## [23] "summary"                "coef"                  
## [25] "knit"                   "data.frame"            
## [27] "rnorm"                  "rpois"                 
## [29] "lm"

Check your workflow plan for other errors and pitfalls, such as circularities and possibly missed file dependencies.

check(plan)

Run the workflow in a single process

Use make(plan) to run your workflow.

make(plan)
## import 'report.Rmd'
## import c
## import summary
## import coef
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import reg2
## import reg1
## import simulate
## import my_knit
## build small
## build large
## build regression1_small
## build regression1_large
## build regression2_small
## build regression2_large
## build summ_regression1_small
## build summ_regression1_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression1_small
## build coef_regression1_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'
## processing file: report.Rmd
## output file: report.md

Use readd() and loadd() to see the targets you generated. (They are stored in the hidden .drake/ folder using storr). Other functions let you interact with and view the cache.

readd(coef_regression2_large)
## (Intercept)          x2 
##   0.9214072  -0.0321417
loadd(small)
head(small)
##            x y
## 1 0.68623897 1
## 2 0.47225869 2
## 3 0.24119171 2
## 4 1.56944167 0
## 5 0.04091244 1
rm(small)
cached(small, large)
## small large 
##  TRUE  TRUE
cached()
##  [1] "'report.Rmd'"           "'report.md'"           
##  [3] "c"                      "coef"                  
##  [5] "coef_regression1_large" "coef_regression1_small"
##  [7] "coef_regression2_large" "coef_regression2_small"
##  [9] "data.frame"             "knit"                  
## [11] "large"                  "lm"                    
## [13] "my_knit"                "reg1"                  
## [15] "reg2"                   "regression1_large"     
## [17] "regression1_small"      "regression2_large"     
## [19] "regression2_small"      "report_dependencies"   
## [21] "rnorm"                  "rpois"                 
## [23] "simulate"               "small"                 
## [25] "summ_regression1_large" "summ_regression1_small"
## [27] "summ_regression2_large" "summ_regression2_small"
## [29] "summary"
built()
##  [1] "'report.md'"            "coef_regression1_large"
##  [3] "coef_regression1_small" "coef_regression2_large"
##  [5] "coef_regression2_small" "large"                 
##  [7] "regression1_large"      "regression1_small"     
##  [9] "regression2_large"      "regression2_small"     
## [11] "report_dependencies"    "small"                 
## [13] "summ_regression1_large" "summ_regression1_small"
## [15] "summ_regression2_large" "summ_regression2_small"
imported()
##  [1] "'report.Rmd'" "c"            "coef"         "data.frame"  
##  [5] "knit"         "lm"           "my_knit"      "reg1"        
##  [9] "reg2"         "rnorm"        "rpois"        "simulate"    
## [13] "summary"
head(read_plan())
##                target                                    command
## 1         'report.md' my_knit('report.Rmd', report_dependencies)
## 2               small                                simulate(5)
## 3               large                               simulate(50)
## 4 report_dependencies    c(small, large, coef_regression2_small)
## 5   regression1_small                                reg1(small)
## 6   regression1_large                                reg1(large)
# read_graph() # reads/plots the tree structure of your workflow plan
head(status()) # last call to make()
##           'report.Rmd'            'report.md'                      c 
##             "finished"             "finished"             "finished" 
##                   coef coef_regression1_large coef_regression1_small 
##             "finished"             "finished"             "finished"
status(large)
##      large 
## "finished"

The next time you run make(plan), nothing will be built because drake knows everything is up to date.

make(plan)
## import 'report.Rmd'
## import c
## import summary
## import coef
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import reg2
## import reg1
## import simulate
## import my_knit

But if you change one of your functions, commands, or other dependencies, drake will update the affected parts of the workflow. Let's say we want to change the quadratic term to a cubic term in our reg2() function.

# Redefine reg2() with a cubic term instead of a quadratic one. Because the
# function body changed meaningfully, drake will rebuild every target that
# depends on reg2() on the next make(plan).
reg2 = function(d){
  d$x3 = d$x^3
  lm(y ~ x3, data = d)
}

Voila! Targets depending on reg2() are updated, and those depending only on reg1() are left alone.

make(plan)
## import 'report.Rmd'
## import c
## import summary
## import coef
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import reg2
## import reg1
## import simulate
## import my_knit
## build regression2_small
## build regression2_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'
## processing file: report.Rmd
## output file: report.md

But trivial changes such as whitespace and comments are totally ignored in your functions and in plan$command.

# Same cubic reg2() as above, differing only in whitespace and comments —
# drake treats such trivial edits as no change, so nothing rebuilds.
reg2 = function(d){
  d$x3 = d$x^3
    lm(y ~ x3, data = d) # I indented here.
}
make(plan) 
## import 'report.Rmd'
## import c
## import summary
## import coef
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import reg2
## import reg1
## import simulate
## import my_knit

Need to add new work on the fly?

Just append rows to the workflow plan. If the rest of your workflow is up to date, only the new work is run.

# Simulate a dataset of n rows whose x and y columns are both independent
# standard-normal draws (x is drawn first, then y).
new_simulation = function(n){
  data.frame(
    x = rnorm(n),
    y = rnorm(n)
  )
}

additions = plan(
  new_data = new_simulation(36) + sqrt(10))  
additions
##     target                       command
## 1 new_data new_simulation(36) + sqrt(10)
plan = rbind(plan, additions)
plan
##                    target                                    command
## 1             'report.md' my_knit('report.Rmd', report_dependencies)
## 2                   small                                simulate(5)
## 3                   large                               simulate(50)
## 4     report_dependencies    c(small, large, coef_regression2_small)
## 5       regression1_small                                reg1(small)
## 6       regression1_large                                reg1(large)
## 7       regression2_small                                reg2(small)
## 8       regression2_large                                reg2(large)
## 9  summ_regression1_small                 summary(regression1_small)
## 10 summ_regression1_large                 summary(regression1_large)
## 11 summ_regression2_small                 summary(regression2_small)
## 12 summ_regression2_large                 summary(regression2_large)
## 13 coef_regression1_small                    coef(regression1_small)
## 14 coef_regression1_large                    coef(regression1_large)
## 15 coef_regression2_small                    coef(regression2_small)
## 16 coef_regression2_large                    coef(regression2_large)
## 17               new_data              new_simulation(36) + sqrt(10)
make(plan)
## import 'report.Rmd'
## import c
## import summary
## import coef
## import sqrt
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import reg2
## import reg1
## import simulate
## import my_knit
## import new_simulation
## build new_data

To clean up, use clean(). Any targets removed from the cache will have to be rebuilt on the next call to make(), so only clean if you are sure you will not lose anything important.

clean(small, reg1) # uncaches individual targets and imported objects
clean() # cleans all targets out of the cache
clean(destroy = TRUE) # removes the cache entirely

High-performance parallel computing

Within a single R session and a single compute node, you can spread your work over multiple parallel processes. Select the type of parallel computing with the parallelism argument to make(), and select the maximum number of parallel tasks with the jobs argument. Set parallelism = "parLapply" (default for Windows) to use parallel::parLapply() in the backend. This approach works on most (if not all) platforms, but setting up the local cluster at the beginning takes extra time. Users with non-Windows platforms can set parallelism = "mclapply" (non-Windows default) to use parallel::mclapply() on the backend, which requires less overhead.

make(plan, jobs = 2) # parallelism == "parLapply" by default.
make(plan, parallelism = "mclapply", jobs = 2) # not for Windows
readd(coef_regression2_large)

Alternatively, set parallelism = "Makefile" to spread targets over multiple parallel R sessions. This gets into true distributed computing. Windows users will need to download and install Rtools. The following are equivalent.

make(plan, parallelism = "Makefile", jobs = 4, verbose = FALSE)
make(plan, parallelism = "Makefile", command = "make", args = "--jobs=4 --silent") 

To distribute those Makefile jobs over multiple nodes on a cluster or supercomputer, you may need a shell.sh file like the following

#!/bin/bash
shift
echo "module load R; $*" | qsub -sync y -cwd -j y

You may need to replace module load R with a command to load a specific version of R. Next, put your main code, including your call to make(plan, ...), inside an R script such as script.R.

To run your workflow on the cluster, use the Linux terminal to enter the following.

nohup nice -19 R CMD BATCH script.R &

Even after you log out, a background process will remain on the login node to submit new jobs through Make as new targets become ready.

A warning about the Makefile

The Makefile generated by make(plan, parallelism = "Makefile") is not standalone. Do not run it outside of drake::make(). Drake uses dummy timestamp files to tell the Makefile what to do, and running make in the terminal will most likely give incorrect results.

Flexible generation of workflow plans

More flexibility for generating workflow plans

If your workflow does not fit the rigid datasets/analyses/summaries framework, check out functions expand(), evaluate(), and gather().

df = plan(data = simulate(center = MU, scale = SIGMA))
df
##   target                              command
## 1   data simulate(center = MU, scale = SIGMA)
df = expand(df, values = c("rep1", "rep2"))
df
##      target                              command
## 1 data_rep1 simulate(center = MU, scale = SIGMA)
## 2 data_rep2 simulate(center = MU, scale = SIGMA)
evaluate(df, wildcard = "MU", values = 1:2)
##        target                             command
## 1 data_rep1_1 simulate(center = 1, scale = SIGMA)
## 2 data_rep1_2 simulate(center = 2, scale = SIGMA)
## 3 data_rep2_1 simulate(center = 1, scale = SIGMA)
## 4 data_rep2_2 simulate(center = 2, scale = SIGMA)
evaluate(df, wildcard = "MU", values = 1:2, expand = FALSE)
##      target                             command
## 1 data_rep1 simulate(center = 1, scale = SIGMA)
## 2 data_rep2 simulate(center = 2, scale = SIGMA)
evaluate(df, rules = list(MU = 1:2, SIGMA = c(0.1, 1)), expand = FALSE)
##      target                           command
## 1 data_rep1 simulate(center = 1, scale = 0.1)
## 2 data_rep2   simulate(center = 2, scale = 1)
evaluate(df, rules = list(MU = 1:2, SIGMA = c(0.1, 1, 10)))
##             target                           command
## 1  data_rep1_1_0.1 simulate(center = 1, scale = 0.1)
## 2    data_rep1_1_1   simulate(center = 1, scale = 1)
## 3   data_rep1_1_10  simulate(center = 1, scale = 10)
## 4  data_rep1_2_0.1 simulate(center = 2, scale = 0.1)
## 5    data_rep1_2_1   simulate(center = 2, scale = 1)
## 6   data_rep1_2_10  simulate(center = 2, scale = 10)
## 7  data_rep2_1_0.1 simulate(center = 1, scale = 0.1)
## 8    data_rep2_1_1   simulate(center = 1, scale = 1)
## 9   data_rep2_1_10  simulate(center = 1, scale = 10)
## 10 data_rep2_2_0.1 simulate(center = 2, scale = 0.1)
## 11   data_rep2_2_1   simulate(center = 2, scale = 1)
## 12  data_rep2_2_10  simulate(center = 2, scale = 10)
gather(df)
##   target                                            command
## 1 target list(data_rep1 = data_rep1, data_rep2 = data_rep2)
gather(df, target = "my_summaries", gather = "rbind")
##         target                                             command
## 1 my_summaries rbind(data_rep1 = data_rep1, data_rep2 = data_rep2)