Skip to content

Commit

Permalink
Disable Google Batch automatic spot retries (#5223)
Browse files Browse the repository at this point in the history
This commit disables the automatic retry made by Google Batch when a spot instance is reclaimed.

The main reasons to disable this capability are:

* The same task can be retried multiple times, incurring a significant spending increase without the user being aware of it
* The Google automatic retry re-executes a task in the same working directory, because the retry is not directly managed by Nextflow. This can introduce nasty side effects caused by partial/corrupted data left by a previous execution
* There's no log/visual feedback during the pipeline execution, because the retry is managed directly by Google Batch.

Users can still enable this capability by setting the following option:

```
google.batch.maxSpotAttempts = n 
```

where n is an integer > 0


Signed-off-by: Paolo Di Tommaso <[email protected]>
  • Loading branch information
pditommaso authored Aug 12, 2024
1 parent f28fcb2 commit aad2153
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 7 deletions.
5 changes: 4 additions & 1 deletion docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,10 @@ The following settings are available for Google Cloud Batch:
`google.batch.maxSpotAttempts`
: :::{versionadded} 23.11.0-edge
:::
: Max number of execution attempts of a job interrupted by a Compute Engine spot reclaim event (default: `5`).
: :::{versionchanged} 24.08.0-edge
The default value was changed from `5` to `0`.
:::
: Max number of execution attempts of a job interrupted by a Compute Engine spot reclaim event (default: `0`).
: See also: `google.batch.autoRetryExitCodes`

`google.project`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@

package nextflow.cloud.google.batch.client


import com.google.auth.oauth2.GoogleCredentials
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import nextflow.Session
import nextflow.cloud.google.GoogleOpts
import nextflow.exception.ProcessUnrecoverableException
import nextflow.util.MemoryUnit
/**
* Model Google Batch config settings
Expand All @@ -33,7 +31,9 @@ import nextflow.util.MemoryUnit
@CompileStatic
class BatchConfig {

static private List<Integer> DEFAULT_RETRY_LIST = List.of(50001)
static final private int DEFAULT_MAX_SPOT_ATTEMPTS = 0

static final private List<Integer> DEFAULT_RETRY_LIST = List.of(50001)

private GoogleOpts googleOpts
private GoogleCredentials credentials
Expand Down Expand Up @@ -74,7 +74,7 @@ class BatchConfig {
result.allowedLocations = session.config.navigate('google.batch.allowedLocations', List.of()) as List<String>
result.bootDiskSize = session.config.navigate('google.batch.bootDiskSize') as MemoryUnit
result.cpuPlatform = session.config.navigate('google.batch.cpuPlatform')
result.maxSpotAttempts = session.config.navigate('google.batch.maxSpotAttempts',5) as int
result.maxSpotAttempts = session.config.navigate('google.batch.maxSpotAttempts', DEFAULT_MAX_SPOT_ATTEMPTS) as int
result.installGpuDrivers = session.config.navigate('google.batch.installGpuDrivers',false)
result.preemptible = session.config.navigate('google.batch.preemptible',false)
result.spot = session.config.navigate('google.batch.spot',false)
Expand All @@ -83,7 +83,7 @@ class BatchConfig {
result.subnetwork = session.config.navigate('google.batch.subnetwork')
result.serviceAccountEmail = session.config.navigate('google.batch.serviceAccountEmail')
result.retryConfig = new BatchRetryConfig( session.config.navigate('google.batch.retryPolicy') as Map ?: Map.of() )
result.autoRetryExitCodes = session.config.navigate('google.batch.autoRetryExitCodes',DEFAULT_RETRY_LIST) as List<Integer>
result.autoRetryExitCodes = session.config.navigate('google.batch.autoRetryExitCodes', DEFAULT_RETRY_LIST) as List<Integer>
return result
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class BatchConfigTest extends Specification {
config.getSpot()
and:
config.retryConfig.maxAttempts == 5
config.maxSpotAttempts == 5
config.maxSpotAttempts == 0
config.autoRetryExitCodes == [50001]
}

Expand Down

0 comments on commit aad2153

Please sign in to comment.