Parallelize on local machine
library(future)
options(future.demo.mandelbrot.resolution = 5000) # 5000x5000 px
demo("mandelbrot", ask = FALSE)
plan(sequential) ## default
plan(multisession)
plan(multicore)
Parallelize on local machine
Standalone background R processes
plan(multisession)
plan(multisession, workers = availableCores()) ## default
plan(multisession, workers = 2)
Real-world example:
library(future.apply)
plan(multisession, workers = 3)
Warning in checkNumberOfLocalWorkers(workers): Careful, you are setting up 3
localhost parallel workers with only 2 CPU cores available for this R process
(per 'system'), which could result in a 150% load. The soft limit is set to
100%. Overusing the CPUs has negative impact on the current R process, but also
on all other processes of yours and others running on the same machine. See
help("parallelly.options", package = "parallelly") for how to override the soft
and hard limits.
info <- future_lapply(seq_len(nbrOfWorkers()), function(idx) {
  data.frame(idx = idx, hostname = Sys.info()[["nodename"]], pid = Sys.getpid())
})
info <- do.call(rbind, info)
info
idx hostname pid
1 1 fv-az802-287 18859
2 2 fv-az802-287 18861
3 3 fv-az802-287 18860
Forked R processes
plan(multicore)
plan(multicore, workers = availableCores()) ## default
plan(multicore, workers = 2)
library(future.apply)
plan(multicore, workers = 3)
info <- future_lapply(seq_len(nbrOfWorkers()), function(idx) {
  data.frame(idx = idx, hostname = Sys.info()[["nodename"]], pid = Sys.getpid())
})
info <- do.call(rbind, info)
info
idx hostname pid
1 1 fv-az802-287 18984
2 2 fv-az802-287 18985
3 3 fv-az802-287 18986
Parallelize on multiple machines
plan(cluster)
plan(cluster, workers = availableWorkers()) ## default
plan(cluster, workers = c("dev1", "dev2", "dev3"))
Real-world example:
library(future.apply)
plan(cluster, workers = c("dev1", "dev2", "dev3", "dev3", "dev3"))
info <- future_lapply(seq_len(nbrOfWorkers()), function(idx) {
  data.frame(idx = idx, hostname = Sys.info()[["nodename"]], pid = Sys.getpid())
})
info <- do.call(rbind, info)
info
idx hostname pid
1 1 dev1.wynton.ucsf.edu 281122
2 2 dev2.wynton.ucsf.edu 29646
3 3 dev3.wynton.ucsf.edu 43826
4 4 dev3.wynton.ucsf.edu 43881
5 5 dev3.wynton.ucsf.edu 43921
Parallelize via HPC job scheduler
plan(batchtools_slurm) ## Slurm cluster
plan(batchtools_sge)   ## SGE cluster
library(future.apply)
plan(future.batchtools::batchtools_sge, workers = 3)
info <- future_lapply(seq_len(nbrOfWorkers()), function(idx) {
  data.frame(idx = idx, hostname = Sys.info()[["nodename"]], pid = Sys.getpid())
})
If we peek at the job scheduler queue right after calling future_lapply(), we would see something like:
$ qstat
job-ID prior name user state time
------------------------------------------------------------
279873 0.000 future_lapply_1 hb qw 04/26/2023 22:46:54
279889 0.000 future_lapply_2 hb qw 04/26/2023 22:47:02
279912 0.000 future_lapply_3 hb qw 04/26/2023 22:47:13
These three jobs represent the three futures we created. When completed, we will see something like:
info <- do.call (rbind, info)
info
idx hostname pid
1 1 qb3-as92 14596
2 2 qb3-as17 48397
3 3 qb3-as04 8698
Alternatives
The future.callr package parallelizes on the local machine using the callr package. It works similarly to multisession, but can use more than 125 parallel workers:
library(future.callr)
plan(callr)
plan(callr, workers = availableCores()) ## default
plan(callr, workers = 2)