From 5f43156545e39caf9842e4d29d99cf6f8cb15e35 Mon Sep 17 00:00:00 2001 From: wlandau Date: Wed, 10 Jul 2024 08:33:27 -0400 Subject: [PATCH] retry_tasks --- DESCRIPTION | 4 ++-- NEWS.md | 4 ++++ R/crew_controller_lsf.R | 4 +++- R/crew_controller_pbs.R | 4 +++- R/crew_controller_sge.R | 4 +++- R/crew_controller_slurm.R | 4 +++- README.Rmd | 2 +- README.md | 2 +- man/crew_controller_lsf.Rd | 8 ++++++++ man/crew_controller_pbs.Rd | 8 ++++++++ man/crew_controller_sge.Rd | 8 ++++++++ man/crew_controller_slurm.Rd | 8 ++++++++ tests/testthat/test-crew_monitor_slurm.R | 3 +++ 13 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 tests/testthat/test-crew_monitor_slurm.R diff --git a/DESCRIPTION b/DESCRIPTION index 58fe579..bce6bec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,7 @@ Description: In computationally demanding analysis projects, 'clustermq' by Schubert (2019) ), and 'batchtools' by Lang, Bischl, and Surmann (2017). . -Version: 0.3.1 +Version: 0.3.2 License: MIT + file LICENSE URL: https://wlandau.github.io/crew.cluster/, https://github.com/wlandau/crew.cluster @@ -49,7 +49,7 @@ Authors@R: c( Depends: R (>= 4.0.0) Imports: - crew (>= 0.8.0), + crew (>= 0.9.5), ps, lifecycle, R6, diff --git a/NEWS.md b/NEWS.md index f8af4be..3182723 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# crew.cluster 0.3.2 + +* Add `retry_tasks` argument. + # crew.cluster 0.3.1 * Add a SLURM monitor (#32, @brendanf). diff --git a/R/crew_controller_lsf.R b/R/crew_controller_lsf.R index f6c3801..b2817a3 100644 --- a/R/crew_controller_lsf.R +++ b/R/crew_controller_lsf.R @@ -37,6 +37,7 @@ crew_controller_lsf <- function( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -74,7 +75,8 @@ crew_controller_lsf <- function( tls_enable = tls_enable, tls_config = tls_config, seconds_interval = seconds_interval, - seconds_timeout = seconds_timeout + seconds_timeout = seconds_timeout, + retry_tasks = retry_tasks ) launcher <- crew_launcher_lsf( name = name, diff --git a/R/crew_controller_pbs.R b/R/crew_controller_pbs.R index 412aee0..4953771 100644 --- a/R/crew_controller_pbs.R +++ b/R/crew_controller_pbs.R @@ -33,6 +33,7 @@ crew_controller_pbs <- function( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -71,7 +72,8 @@ crew_controller_pbs <- function( tls_enable = tls_enable, tls_config = tls_config, seconds_interval = seconds_interval, - seconds_timeout = seconds_timeout + seconds_timeout = seconds_timeout, + retry_tasks = retry_tasks ) launcher <- crew_launcher_pbs( name = name, diff --git a/R/crew_controller_sge.R b/R/crew_controller_sge.R index c9ab4c0..f3357c5 100644 --- a/R/crew_controller_sge.R +++ b/R/crew_controller_sge.R @@ -33,6 +33,7 @@ crew_controller_sge <- function( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -73,7 +74,8 @@ crew_controller_sge <- function( tls_enable = tls_enable, tls_config = tls_config, seconds_interval = seconds_interval, - seconds_timeout = seconds_timeout + seconds_timeout = seconds_timeout, + retry_tasks = retry_tasks ) launcher <- crew_launcher_sge( name = name, diff --git a/R/crew_controller_slurm.R b/R/crew_controller_slurm.R index c9ae2bd..6106b88 100644 --- a/R/crew_controller_slurm.R +++ b/R/crew_controller_slurm.R @@ -37,6 +37,7 @@ crew_controller_slurm <- function( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -74,7 +75,8 @@ crew_controller_slurm <- function( tls_enable = tls_enable, tls_config = tls_config, seconds_interval = seconds_interval, - seconds_timeout = seconds_timeout + seconds_timeout = seconds_timeout, + retry_tasks = retry_tasks ) launcher <- crew_launcher_slurm( name = name, diff --git a/README.Rmd b/README.Rmd index 811e2b1..36a168f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -51,7 +51,7 @@ controller <- crew_controller_sge( controller$start() ``` -At this point, usage is exactly the same as basic [`crew`](https://wlandau.github.io/crew). The `push()` method submits tasks and auto-scales SGE workers to meet demand. +At this point, usage is exactly the same as basic [`crew`](https://wlandau.github.io/crew/). The `push()` method submits tasks and auto-scales SGE workers to meet demand. ```{r} controller$push(name = "do work", command = do_work()) diff --git a/README.md b/README.md index 7aed521..4bfee46 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ controller$start() ``` At this point, usage is exactly the same as basic -[`crew`](https://wlandau.github.io/crew). The `push()` method submits +[`crew`](https://wlandau.github.io/crew/). The `push()` method submits tasks and auto-scales SGE workers to meet demand. ``` r diff --git a/man/crew_controller_lsf.Rd b/man/crew_controller_lsf.Rd index ccfb228..e0645ef 100644 --- a/man/crew_controller_lsf.Rd +++ b/man/crew_controller_lsf.Rd @@ -19,6 +19,7 @@ crew_controller_lsf( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -92,6 +93,13 @@ See the \code{walltime} argument of \code{mirai::daemon()}.} \item{seconds_exit}{Deprecated on 2023-09-21 in version 0.1.2.9000. No longer necessary.} +\item{retry_tasks}{\code{TRUE} to automatically retry a task in the event of +an unexpected worker exit. \code{FALSE} to give up on the first exit and +return a \code{mirai} error code (code number 19). +\code{TRUE} (default) is recommended in most situations. +Use \code{FALSE} for debugging purposes, e.g. to confirm that a task +is causing a worker to run out of memory or crash in some other way.} + \item{tasks_max}{Maximum number of tasks that a worker will do before exiting. See the \code{maxtasks} argument of \code{mirai::daemon()}. \code{crew} does not diff --git a/man/crew_controller_pbs.Rd b/man/crew_controller_pbs.Rd index fe7f82c..c602083 100644 --- a/man/crew_controller_pbs.Rd +++ b/man/crew_controller_pbs.Rd @@ -19,6 +19,7 @@ crew_controller_pbs( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -93,6 +94,13 @@ See the \code{walltime} argument of \code{mirai::daemon()}.} \item{seconds_exit}{Deprecated on 2023-09-21 in version 0.1.2.9000. No longer necessary.} +\item{retry_tasks}{\code{TRUE} to automatically retry a task in the event of +an unexpected worker exit. \code{FALSE} to give up on the first exit and +return a \code{mirai} error code (code number 19). +\code{TRUE} (default) is recommended in most situations. +Use \code{FALSE} for debugging purposes, e.g. to confirm that a task +is causing a worker to run out of memory or crash in some other way.} + \item{tasks_max}{Maximum number of tasks that a worker will do before exiting. See the \code{maxtasks} argument of \code{mirai::daemon()}. \code{crew} does not diff --git a/man/crew_controller_sge.Rd b/man/crew_controller_sge.Rd index 83743b1..a91a821 100644 --- a/man/crew_controller_sge.Rd +++ b/man/crew_controller_sge.Rd @@ -19,6 +19,7 @@ crew_controller_sge( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -95,6 +96,13 @@ See the \code{walltime} argument of \code{mirai::daemon()}.} \item{seconds_exit}{Deprecated on 2023-09-21 in version 0.1.2.9000. No longer necessary.} +\item{retry_tasks}{\code{TRUE} to automatically retry a task in the event of +an unexpected worker exit. \code{FALSE} to give up on the first exit and +return a \code{mirai} error code (code number 19). +\code{TRUE} (default) is recommended in most situations. +Use \code{FALSE} for debugging purposes, e.g. to confirm that a task +is causing a worker to run out of memory or crash in some other way.} + \item{tasks_max}{Maximum number of tasks that a worker will do before exiting. See the \code{maxtasks} argument of \code{mirai::daemon()}. \code{crew} does not diff --git a/man/crew_controller_slurm.Rd b/man/crew_controller_slurm.Rd index 7a9178b..a9b541f 100644 --- a/man/crew_controller_slurm.Rd +++ b/man/crew_controller_slurm.Rd @@ -19,6 +19,7 @@ crew_controller_slurm( seconds_idle = Inf, seconds_wall = Inf, seconds_exit = NULL, + retry_tasks = TRUE, tasks_max = Inf, tasks_timers = 0L, reset_globals = TRUE, @@ -92,6 +93,13 @@ See the \code{walltime} argument of \code{mirai::daemon()}.} \item{seconds_exit}{Deprecated on 2023-09-21 in version 0.1.2.9000. No longer necessary.} +\item{retry_tasks}{\code{TRUE} to automatically retry a task in the event of +an unexpected worker exit. \code{FALSE} to give up on the first exit and +return a \code{mirai} error code (code number 19). +\code{TRUE} (default) is recommended in most situations. +Use \code{FALSE} for debugging purposes, e.g. to confirm that a task +is causing a worker to run out of memory or crash in some other way.} + \item{tasks_max}{Maximum number of tasks that a worker will do before exiting. See the \code{maxtasks} argument of \code{mirai::daemon()}. \code{crew} does not diff --git a/tests/testthat/test-crew_monitor_slurm.R b/tests/testthat/test-crew_monitor_slurm.R new file mode 100644 index 0000000..b613fed --- /dev/null +++ b/tests/testthat/test-crew_monitor_slurm.R @@ -0,0 +1,3 @@ +test_that("crew_monitor_slurm() validate", { + expect_silent(crew_monitor_slurm()$validate()) +})