Skip to content

Commit

Permalink
[e2e] mark TestGPU as flake (DataDog#32534)
Browse files Browse the repository at this point in the history
  • Loading branch information
pducolin authored Dec 27, 2024
1 parent 663520b commit 46749fd
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 5 deletions.
2 changes: 1 addition & 1 deletion test/new-e2e/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ require (
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/pulumi/appdash v0.0.0-20231130102222-75f619a67231 // indirect
github.com/pulumi/esc v0.10.0 // indirect
github.com/pulumi/pulumi-command/sdk v1.0.1 // indirect
github.com/pulumi/pulumi-command/sdk v1.0.1
github.com/pulumi/pulumi-docker/sdk/v4 v4.5.7 // indirect
github.com/pulumi/pulumi-libvirt/sdk v0.5.3 // indirect
github.com/pulumi/pulumi-random/sdk/v4 v4.16.7 // indirect
Expand Down
2 changes: 2 additions & 0 deletions test/new-e2e/tests/gpu/gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ func dockerImageName() string {
// TestGPUSuite runs tests for the VM interface to ensure its implementation is correct.
// Not to be run in parallel, as some tests wait until the checks are available.
func TestGPUSuite(t *testing.T) {
// incident-33572
flake.Mark(t)
provParams := getDefaultProvisionerParams()

// Append our vectorAdd image for testing
Expand Down
24 changes: 20 additions & 4 deletions test/new-e2e/tests/gpu/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ import (

"github.com/pulumi/pulumi/sdk/v3/go/pulumi"

"github.com/pulumi/pulumi-command/sdk/go/command/remote"

"github.com/DataDog/test-infra-definitions/common/utils"
"github.com/DataDog/test-infra-definitions/components/command"
"github.com/DataDog/test-infra-definitions/components/datadog/agent"
"github.com/DataDog/test-infra-definitions/components/datadog/agentparams"
"github.com/DataDog/test-infra-definitions/components/docker"
"github.com/DataDog/test-infra-definitions/components/os"
"github.com/DataDog/test-infra-definitions/components/remote"
componentsremote "github.com/DataDog/test-infra-definitions/components/remote"
"github.com/DataDog/test-infra-definitions/resources/aws"
"github.com/DataDog/test-infra-definitions/scenarios/aws/ec2"
"github.com/DataDog/test-infra-definitions/scenarios/aws/fakeintake"
Expand Down Expand Up @@ -151,6 +153,20 @@ func gpuInstanceProvisioner(params *provisionerParams) provisioners.Provisioner
if err != nil {
return fmt.Errorf("validateDockerCuda failed: %w", err)
}
// incident-33572: log the output of the CUDA validation command
pulumi.All(dockerCudaValidateCmd.Stdout, dockerCudaValidateCmd.Stderr).ApplyT(func(outputs []string) error {
stdout := outputs[0]
stderr := outputs[1]
err := ctx.Log.Info(fmt.Sprintf("Docker CUDA validation stdout: %s", stdout), nil)
if err != nil {
return err
}
err = ctx.Log.Info(fmt.Sprintf("Docker CUDA validation stderr: %s", stderr), nil)
if err != nil {
return err
}
return nil
})

// Combine agent options from the parameters with the fakeintake and docker dependencies
params.agentOptions = append(params.agentOptions,
Expand All @@ -177,7 +193,7 @@ func gpuInstanceProvisioner(params *provisionerParams) provisioners.Provisioner
}

// validateGPUDevices checks that there are GPU devices present and accessible
func validateGPUDevices(e aws.Environment, vm *remote.Host) ([]pulumi.Resource, error) {
func validateGPUDevices(e aws.Environment, vm *componentsremote.Host) ([]pulumi.Resource, error) {
commands := map[string]string{
"pci": fmt.Sprintf("lspci -d %s:: | grep NVIDIA", nvidiaPCIVendorID),
"driver": "lsmod | grep nvidia",
Expand All @@ -203,7 +219,7 @@ func validateGPUDevices(e aws.Environment, vm *remote.Host) ([]pulumi.Resource,
return cmds, nil
}

func downloadDockerImages(e aws.Environment, vm *remote.Host, images []string, dependsOn ...pulumi.Resource) ([]pulumi.Resource, error) {
func downloadDockerImages(e aws.Environment, vm *componentsremote.Host, images []string, dependsOn ...pulumi.Resource) ([]pulumi.Resource, error) {
var cmds []pulumi.Resource

for i, image := range images {
Expand All @@ -224,7 +240,7 @@ func downloadDockerImages(e aws.Environment, vm *remote.Host, images []string, d
return cmds, nil
}

func validateDockerCuda(e aws.Environment, vm *remote.Host, dependsOn ...pulumi.Resource) (pulumi.Resource, error) {
func validateDockerCuda(e aws.Environment, vm *componentsremote.Host, dependsOn ...pulumi.Resource) (*remote.Command, error) {
return vm.OS.Runner().Command(
e.CommonNamer().ResourceName("docker-cuda-validate"),
&command.Args{
Expand Down

0 comments on commit 46749fd

Please sign in to comment.