Skip to content

Commit

Permalink
Add support for Google Batch user-specified boot images (#5268)
Browse files Browse the repository at this point in the history
The default boot image is batch-cos, but it may not be desired in all
situations. In particular, there is an ongoing issue in which the
batch-cos image does not retry pulling a docker image if there was a
network issue. The user may also have some other configuration
pre-configured in their custom boot image.

This commit adds the ability to specify a custom boot disk image by using 
the configuration option 

```
google.batch.bootDiskImage = '<NAME>'
```


Signed-off-by: Siddhartha Bagaria <[email protected]>
Signed-off-by: Paolo Di Tommaso <[email protected]>
Co-authored-by: Paolo Di Tommaso <[email protected]>
  • Loading branch information
siddharthab and pditommaso authored Sep 3, 2024
1 parent b3ba2c2 commit 0aaa648
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 7 deletions.
5 changes: 5 additions & 0 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,11 @@ The following settings are available for Google Cloud Batch:
:::
: Define the set of allowed locations for VMs to be provisioned. See [Google documentation](https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#locationpolicy) for details (default: no restriction).

`google.batch.bootDiskImage`
: :::{versionadded} 24.08.0-edge
:::
: Set the image URI of the virtual machine boot disk, e.g. `batch-debian`. See [Google documentation](https://cloud.google.com/batch/docs/vm-os-environment-overview#vm-os-image-options) for details (default: none).

`google.batch.bootDiskSize`
: Set the size of the virtual machine boot disk, e.g. `50.GB` (default: none).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,9 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
instancePolicyOrTemplate.setInstallGpuDrivers(true)
}

if( executor.config.getBootDiskImage() )
instancePolicy.setBootDisk( AllocationPolicy.Disk.newBuilder().setImage( executor.config.getBootDiskImage() ) )

if( fusionEnabled() && !disk ) {
disk = new DiskResource(request: '375 GB', type: 'local-ssd')
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adding local volume as fusion scratch: $disk"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class BatchConfig {
private GoogleOpts googleOpts
private GoogleCredentials credentials
private List<String> allowedLocations
private String bootDiskImage
private MemoryUnit bootDiskSize
private String cpuPlatform
private int maxSpotAttempts
Expand All @@ -54,6 +55,7 @@ class BatchConfig {
GoogleOpts getGoogleOpts() { return googleOpts }
GoogleCredentials getCredentials() { return credentials }
List<String> getAllowedLocations() { allowedLocations }
String getBootDiskImage() { bootDiskImage }
MemoryUnit getBootDiskSize() { bootDiskSize }
String getCpuPlatform() { cpuPlatform }
int getMaxSpotAttempts() { maxSpotAttempts }
Expand All @@ -72,6 +74,7 @@ class BatchConfig {
result.googleOpts = GoogleOpts.create(session)
result.credentials = result.googleOpts.credentials
result.allowedLocations = session.config.navigate('google.batch.allowedLocations', List.of()) as List<String>
result.bootDiskImage = session.config.navigate('google.batch.bootDiskImage')
result.bootDiskSize = session.config.navigate('google.batch.bootDiskSize') as MemoryUnit
result.cpuPlatform = session.config.navigate('google.batch.cpuPlatform')
result.maxSpotAttempts = session.config.navigate('google.batch.maxSpotAttempts', DEFAULT_MAX_SPOT_ATTEMPTS) as int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class GoogleBatchTaskHandlerTest extends Specification {
!instancePolicy.getMachineType()
!instancePolicy.getMinCpuPlatform()
instancePolicy.getProvisioningModel().toString() == 'PROVISIONING_MODEL_UNSPECIFIED'
!instancePolicy.getBootDisk().getImage()
and:
allocationPolicy.getLocation().getAllowedLocationsCount() == 0
allocationPolicy.getNetwork().getNetworkInterfacesCount() == 0
Expand All @@ -121,6 +122,7 @@ class GoogleBatchTaskHandlerTest extends Specification {
and:
def ACCELERATOR = new AcceleratorResource(request: 1, type: 'nvidia-tesla-v100')
def BOOT_DISK = MemoryUnit.of('10 GB')
def BOOT_IMAGE = 'batch-debian'
def CONTAINER_IMAGE = 'ubuntu:22.1'
def CONTAINER_OPTS = '--this --that'
def CPU_PLATFORM = 'Intel Skylake'
Expand All @@ -134,6 +136,7 @@ class GoogleBatchTaskHandlerTest extends Specification {
getConfig() >> Mock(BatchConfig) {
getAllowedLocations() >> ['zones/us-central1-a', 'zones/us-central1-c']
getBootDiskSize() >> BOOT_DISK
getBootDiskImage() >> BOOT_IMAGE
getCpuPlatform() >> CPU_PLATFORM
getMaxSpotAttempts() >> 5
getSpot() >> true
Expand Down Expand Up @@ -211,6 +214,7 @@ class GoogleBatchTaskHandlerTest extends Specification {
and:
instancePolicy.getAccelerators(0).getCount() == 1
instancePolicy.getAccelerators(0).getType() == ACCELERATOR.type
instancePolicy.getBootDisk().getImage() == BOOT_IMAGE
instancePolicy.getDisks(0).getNewDisk().getSizeGb() == DISK.request.toGiga()
instancePolicy.getDisks(0).getNewDisk().getType() == DISK.type
instancePolicy.getMachineType() == MACHINE_TYPE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package nextflow.cloud.google.batch.client

import nextflow.Session
import nextflow.util.MemoryUnit
import spock.lang.Requires
import spock.lang.Specification
/**
Expand All @@ -29,21 +30,20 @@ class BatchConfigTest extends Specification {
@Requires({System.getenv('GOOGLE_APPLICATION_CREDENTIALS')})
def 'should create batch config' () {
given:
def CONFIG = [google: [
batch: [
spot: true
]
] ]
def CONFIG = [:]
def session = Mock(Session) { getConfig()>>CONFIG }

when:
def config = BatchConfig.create(session)
then:
config.getSpot()
!config.getSpot()
and:
config.retryConfig.maxAttempts == 5
config.maxSpotAttempts == 0
config.autoRetryExitCodes == [50001]
and:
!config.bootDiskImage
!config.bootDiskSize
}

@Requires({System.getenv('GOOGLE_APPLICATION_CREDENTIALS')})
Expand All @@ -54,7 +54,9 @@ class BatchConfigTest extends Specification {
spot: true,
maxSpotAttempts: 8,
autoRetryExitCodes: [50001, 50003, 50005],
retryPolicy: [maxAttempts: 10]
retryPolicy: [maxAttempts: 10],
bootDiskImage: 'batch-foo',
bootDiskSize: '100GB'
]
] ]
def session = Mock(Session) { getConfig()>>CONFIG }
Expand All @@ -67,6 +69,9 @@ class BatchConfigTest extends Specification {
config.retryConfig.maxAttempts == 10
config.maxSpotAttempts == 8
config.autoRetryExitCodes == [50001, 50003, 50005]
and:
config.bootDiskImage == 'batch-foo'
config.bootDiskSize == MemoryUnit.of('100GB')
}

}

0 comments on commit 0aaa648

Please sign in to comment.