From 46749fd26414583d6657d39a452d4ad1ee6ecd61 Mon Sep 17 00:00:00 2001 From: pducolin <45568537+pducolin@users.noreply.github.com> Date: Fri, 27 Dec 2024 15:01:32 +0100 Subject: [PATCH] [e2e] mark TestGPU as flake (#32534) --- test/new-e2e/go.mod | 2 +- test/new-e2e/tests/gpu/gpu_test.go | 2 ++ test/new-e2e/tests/gpu/provisioner.go | 24 ++++++++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/test/new-e2e/go.mod b/test/new-e2e/go.mod index 2a0752d059c9d..a23fccb9754f0 100644 --- a/test/new-e2e/go.mod +++ b/test/new-e2e/go.mod @@ -216,7 +216,7 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/pulumi/appdash v0.0.0-20231130102222-75f619a67231 // indirect github.com/pulumi/esc v0.10.0 // indirect - github.com/pulumi/pulumi-command/sdk v1.0.1 // indirect + github.com/pulumi/pulumi-command/sdk v1.0.1 github.com/pulumi/pulumi-docker/sdk/v4 v4.5.7 // indirect github.com/pulumi/pulumi-libvirt/sdk v0.5.3 // indirect github.com/pulumi/pulumi-random/sdk/v4 v4.16.7 // indirect diff --git a/test/new-e2e/tests/gpu/gpu_test.go b/test/new-e2e/tests/gpu/gpu_test.go index 0c71a1fe7f74c..6ba889a89550b 100644 --- a/test/new-e2e/tests/gpu/gpu_test.go +++ b/test/new-e2e/tests/gpu/gpu_test.go @@ -42,6 +42,8 @@ func dockerImageName() string { // TestGPUSuite runs tests for the VM interface to ensure its implementation is correct. // Not to be run in parallel, as some tests wait until the checks are available. 
func TestGPUSuite(t *testing.T) { + // incident-33572 + flake.Mark(t) provParams := getDefaultProvisionerParams() // Append our vectorAdd image for testing diff --git a/test/new-e2e/tests/gpu/provisioner.go b/test/new-e2e/tests/gpu/provisioner.go index 40b6d034c3d93..678b069eec31e 100644 --- a/test/new-e2e/tests/gpu/provisioner.go +++ b/test/new-e2e/tests/gpu/provisioner.go @@ -11,13 +11,15 @@ import ( "github.com/pulumi/pulumi/sdk/v3/go/pulumi" + "github.com/pulumi/pulumi-command/sdk/go/command/remote" + "github.com/DataDog/test-infra-definitions/common/utils" "github.com/DataDog/test-infra-definitions/components/command" "github.com/DataDog/test-infra-definitions/components/datadog/agent" "github.com/DataDog/test-infra-definitions/components/datadog/agentparams" "github.com/DataDog/test-infra-definitions/components/docker" "github.com/DataDog/test-infra-definitions/components/os" - "github.com/DataDog/test-infra-definitions/components/remote" + componentsremote "github.com/DataDog/test-infra-definitions/components/remote" "github.com/DataDog/test-infra-definitions/resources/aws" "github.com/DataDog/test-infra-definitions/scenarios/aws/ec2" "github.com/DataDog/test-infra-definitions/scenarios/aws/fakeintake" @@ -151,6 +153,20 @@ func gpuInstanceProvisioner(params *provisionerParams) provisioners.Provisioner if err != nil { return fmt.Errorf("validateDockerCuda failed: %w", err) } + // incident-33572: log the output of the CUDA validation command; pulumi.All resolves to []interface{}, so the ApplyT callback must accept that type (a []string parameter is rejected by ApplyT) + pulumi.All(dockerCudaValidateCmd.Stdout, dockerCudaValidateCmd.Stderr).ApplyT(func(outputs []interface{}) error { + stdout := outputs[0] + stderr := outputs[1] + err := ctx.Log.Info(fmt.Sprintf("Docker CUDA validation stdout: %s", stdout), nil) + if err != nil { + return err + } + err = ctx.Log.Info(fmt.Sprintf("Docker CUDA validation stderr: %s", stderr), nil) + if err != nil { + return err + } + return nil + }) // Combine agent options from the parameters with the fakeintake and docker dependencies params.agentOptions = 
append(params.agentOptions, @@ -177,7 +193,7 @@ func gpuInstanceProvisioner(params *provisionerParams) provisioners.Provisioner } // validateGPUDevices checks that there are GPU devices present and accesible -func validateGPUDevices(e aws.Environment, vm *remote.Host) ([]pulumi.Resource, error) { +func validateGPUDevices(e aws.Environment, vm *componentsremote.Host) ([]pulumi.Resource, error) { commands := map[string]string{ "pci": fmt.Sprintf("lspci -d %s:: | grep NVIDIA", nvidiaPCIVendorID), "driver": "lsmod | grep nvidia", @@ -203,7 +219,7 @@ func validateGPUDevices(e aws.Environment, vm *remote.Host) ([]pulumi.Resource, return cmds, nil } -func downloadDockerImages(e aws.Environment, vm *remote.Host, images []string, dependsOn ...pulumi.Resource) ([]pulumi.Resource, error) { +func downloadDockerImages(e aws.Environment, vm *componentsremote.Host, images []string, dependsOn ...pulumi.Resource) ([]pulumi.Resource, error) { var cmds []pulumi.Resource for i, image := range images { @@ -224,7 +240,7 @@ func downloadDockerImages(e aws.Environment, vm *remote.Host, images []string, d return cmds, nil } -func validateDockerCuda(e aws.Environment, vm *remote.Host, dependsOn ...pulumi.Resource) (pulumi.Resource, error) { +func validateDockerCuda(e aws.Environment, vm *componentsremote.Host, dependsOn ...pulumi.Resource) (*remote.Command, error) { return vm.OS.Runner().Command( e.CommonNamer().ResourceName("docker-cuda-validate"), &command.Args{