Storage

Caching, hashing, and customization

William Michael Landau

2017-10-17

1 Storage basics

When you run make(), drake puts your imports and output targets in a hidden cache, or storage location.

library(drake)
load_basic_example()
config <- make(my_plan, verbose = FALSE, return_config = TRUE)
## Warning in make(my_plan, verbose = FALSE, return_config = TRUE):
## The return_config argument to make() is deprecated. Now, an internal
## configuration list is always invisibly returned.

You can explore your cached data using functions like loadd(), readd(), and cached().

head(cached())
## [1] "'report.Rmd'"           "'report.md'"           
## [3] "coef_regression1_large" "coef_regression1_small"
## [5] "coef_regression2_large" "coef_regression2_small"
readd(small)
##            x y
## 1 -0.8405761 1
## 2  0.3728460 1
## 3 -0.9197519 4
## 4 -0.8138454 4
## 5  0.6081448 0
loadd(large)
head(large)
##             x y
## 1 -1.04907427 1
## 2  1.48356981 0
## 3 -0.98970865 1
## 4  0.04199456 0
## 5 -0.26483752 0
## 6 -1.02497511 0
rm(large) # Does not remove `large` from the cache.

2 Caches as R objects

The storr package does the heavy lifting. A storr is an object in R that serves as an abstraction for a storage backend, usually a file system. See the main storr vignette for a thorough walkthrough.

class(config$cache) # from `config <- make(..., return_config = TRUE)`
## [1] "storr" "R6"
cache <- get_cache() # Get the default cache from the last build.
class(cache)
## [1] "storr" "R6"
cache$list() # functionality from storr
##  [1] "'report.Rmd'"           "'report.md'"           
##  [3] "coef_regression1_large" "coef_regression1_small"
##  [5] "coef_regression2_large" "coef_regression2_small"
##  [7] "coefficients"           "data.frame"            
##  [9] "knit"                   "large"                 
## [11] "lm"                     "reg1"                  
## [13] "reg2"                   "regression1_large"     
## [15] "regression1_small"      "regression2_large"     
## [17] "regression2_small"      "rpois"                 
## [19] "simulate"               "small"                 
## [21] "stats::rnorm"           "summ_regression1_large"
## [23] "summ_regression1_small" "summ_regression2_large"
## [25] "summ_regression2_small" "summary"               
## [27] "suppressWarnings"
cache$get("small") # functionality from storr
## $type
## [1] "object"
## 
## $value
##            x y
## 1 -0.8405761 1
## 2  0.3728460 1
## 3 -0.9197519 4
## 4 -0.8138454 4
## 5  0.6081448 0
## 
## $imported
## [1] FALSE

3 Hash algorithms

The key to storr’s internals is the concept of hashing. Storr uses hashes to label the objects it stores, and drake leverages these hashes to figure out what is up to date and what needs to be (re)built. A hash is like a fingerprint for a piece of data, so the hash should change if the dataset changes. Regardless of the data’s size, the hash always has the same number of characters.

library(digest) # package for hashing objects and files
smaller_data <- 12
larger_data <- rnorm(1000)
digest(smaller_data) # compute the hash
## [1] "23c80a31c0713176016e6e18d76a5f31"
digest(larger_data)
## [1] "dcac878fe45f6220e8fb6194e67243da"

However, different hash algorithms vary in output length.

digest(larger_data, algo = "sha512")
## [1] "4665f744a15b53d289c95c6cfc9a5c63eec0321756d38b790398c916a42d4779e6e915b63913c30bdac38576eccca7630bc4793e34865f6666acea308dcc5de7"
digest(larger_data, algo = "md5")
## [1] "dcac878fe45f6220e8fb6194e67243da"
digest(larger_data, algo = "xxhash64")
## [1] "0779a0749e7b5165"
digest(larger_data, algo = "murmur32")
## [1] "1ba7b6d5"

4 Which hash algorithm should you choose?

Hashing is expensive, and unsurprisingly, shorter hashes are usually faster to compute. So why not always use murmur32? One reason is the risk of collisions: when two different objects have the same hash. In general, shorter hashes have higher risks of collisions. We want our fingerprints to be unique. On the other hand, a longer hash is not always the answer. Besides speed, the decision depends on how we use the output. Drake and storr both use hash keys as names for internal cache files, and in general, file names should respect the 260-character cap on Windows file paths. That is why drake uses a shorter hash algorithm for internal cache-related file names and a longer hash algorithm for everything else.

default_short_hash_algo() # for drake
## [1] "xxhash64"
default_long_hash_algo()
## [1] "sha256"
short_hash(cache)
## [1] "xxhash64"
long_hash(cache)
## [1] "sha256"

5 Select the hash algorithms of the default cache

For new projects, use new_cache() to set the hashes of the default cache.

cache_path(cache) # default cache from before
## [1] "/tmp/Rtmp3fe9pW/Rbuild24bf574dbaea/drake/vignettes/.drake"
clean(destroy = TRUE) # start from scratch to reset both hash algorithms
tmp <- new_cache(
  path = default_cache_path(), # the `.drake/` folder
  short_hash_algo = "crc32",
  long_hash_algo = "sha1"
)

The cache at default_cache_path() (equivalently, the .drake/ folder) is the default cache used for make().

config <- make(my_plan, verbose = FALSE, return_config = TRUE)
## Warning in make(my_plan, verbose = FALSE, return_config = TRUE):
## The return_config argument to make() is deprecated. Now, an internal
## configuration list is always invisibly returned.
short_hash(config$cache) # would have been xxhash64 (default_short_hash_algo())
## [1] "crc32"
long_hash(config$cache) # would have been sha256 (default_long_hash_algo())
## [1] "sha1"

You can change the long hash algorithm without throwing away the cache, but the project will rebuild from scratch. As for the short hash, you are committed until you delete the cache and its supporting files.

outdated(my_plan, verbose = FALSE) # empty
config$cache <- configure_cache(
  config$cache,
  long_hash_algo = "murmur32",
  overwrite_hash_algos = TRUE
)

Below, the targets become outdated because the existing hash keys do not match the new hash algorithm.

outdated(my_plan, verbose = FALSE)
##  [1] "'report.md'"            "coef_regression1_large"
##  [3] "coef_regression1_small" "coef_regression2_large"
##  [5] "coef_regression2_small" "large"                 
##  [7] "regression1_large"      "regression1_small"     
##  [9] "regression2_large"      "regression2_small"     
## [11] "small"                  "summ_regression1_large"
## [13] "summ_regression1_small" "summ_regression2_large"
## [15] "summ_regression2_small"
config <- make(my_plan, verbose = FALSE, return_config = TRUE)
## Warning in make(my_plan, verbose = FALSE, return_config = TRUE):
## The return_config argument to make() is deprecated. Now, an internal
## configuration list is always invisibly returned.
short_hash(config$cache) # same as before
## [1] "crc32"
long_hash(config$cache) # different from before
## [1] "murmur32"

6 More on custom caches

You do not need to use the default cache whose files are at default_cache_path() (.drake/). However, if you use a different file system, such as the custom faster_cache/ folder below, you will need to manually supply the cache to all functions that require one.

faster_cache <- new_cache(
  path = "faster_cache",
  short_hash_algo = "murmur32",
  long_hash_algo = "murmur32"
)
cache_path(faster_cache)
## [1] "/tmp/Rtmp3fe9pW/Rbuild24bf574dbaea/drake/vignettes/faster_cache"
cache_path(cache) # location of the previous cache
## [1] "/tmp/Rtmp3fe9pW/Rbuild24bf574dbaea/drake/vignettes/.drake"
short_hash(faster_cache)
## [1] "murmur32"
long_hash(faster_cache)
## [1] "murmur32"
new_plan <- plan(
  simple = 1 + 1
)
make(new_plan, cache = faster_cache)
## check 1 item: simple
## target simple
cached(cache = faster_cache)
## [1] "simple"
readd(simple, cache = faster_cache)
## [1] 2

7 Recovering the cache

You can recover an old cache from the file system. You could use storr::storr_rds() directly if you know the short hash algorithm, but this_cache() and recover_cache() are safer for drake.

old_cache <- this_cache("faster_cache") # Get a cache you know exists...
recovered <- recover_cache("faster_cache") # or create a new one if missing.

8 More on storr caches

If you want to bypass drake and generate a cache directly from storr, it is best to do so right from the beginning.

library(storr)
my_storr <- storr_rds("my_storr", mangle_key = TRUE)
make(new_plan, cache = faster_cache)
## Unloading targets from environment:
##   simple
## check 1 item: simple
cached(cache = faster_cache)
## [1] "simple"
readd(simple, cache = faster_cache)
## [1] 2

Drake supports storr_rds() caches. Other caches may be possible, but they should have a storr-like API and namespace support.

9 In-memory caches

Some caches store your data in the computer’s memory rather than saved files. Drake can make use of these in-memory caches, but not with any kind of parallel computing. In other words, when you call make(), the parallelism argument cannot be "Makefile" and jobs must be 1 (default). Also, keep in mind that unless you save your workspace, your in-memory cache will disappear when you close your R session.

memory_cache <- storr_environment()
other_plan <- plan(
  some_data = rnorm(50),
  more_data = rpois(75, lambda = 10),
  result = mean(c(some_data, more_data))
)
make(other_plan, cache = memory_cache)
## check 4 items: rnorm, rpois, c, mean
## import rnorm
## import rpois
## import c
## import mean
## check 2 items: some_data, more_data
## target some_data
## target more_data
## check 1 item: result
## target result
cached(cache = memory_cache)
## [1] "c"         "mean"      "more_data" "result"    "rnorm"     "rpois"    
## [7] "some_data"
readd(result, cache = memory_cache)
## [1] 6.121254

10 Cache types

Drake has functions to help you create caches with known supported types.

default_cache_type()
## [1] "storr_rds"
cache_types()
## [1] "storr_rds"         "storr_environment"
in_memory_cache_types()
## [1] "storr_environment"
env <- new.env()
my_type <- new_cache(type = "storr_environment")
my_type_2 <- new_cache(type = "storr_environment", envir = env)
ls(env)
## [1] "data"           "hash_algorithm" "keys"

For new in-memory caches, please use new_cache() rather than get_cache() or recover_cache().

11 Cleaning up

If you want to start from scratch, you can clean() the cache. Use the destroy argument to remove it completely. cache$del() and cache$destroy() are also options, but they leave output file targets dangling. By contrast, clean(destroy = TRUE) removes file targets generated by drake::make().

clean(small, large)
cached() # 'small' and 'large' are gone
##  [1] "'report.Rmd'"           "'report.md'"           
##  [3] "coef_regression1_large" "coef_regression1_small"
##  [5] "coef_regression2_large" "coef_regression2_small"
##  [7] "coefficients"           "data.frame"            
##  [9] "knit"                   "lm"                    
## [11] "reg1"                   "reg2"                  
## [13] "regression1_large"      "regression1_small"     
## [15] "regression2_large"      "regression2_small"     
## [17] "rpois"                  "simulate"              
## [19] "stats::rnorm"           "summ_regression1_large"
## [21] "summ_regression1_small" "summ_regression2_large"
## [23] "summ_regression2_small" "summary"               
## [25] "suppressWarnings"
clean(destroy = TRUE)
clean(destroy = TRUE, cache = faster_cache)
clean(destroy = TRUE, cache = my_storr)