Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[e2e] mark TestGPU as flake #32534

Merged
merged 3 commits into from
Dec 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion test/new-e2e/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ require (
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/pulumi/appdash v0.0.0-20231130102222-75f619a67231 // indirect
github.com/pulumi/esc v0.10.0 // indirect
github.com/pulumi/pulumi-command/sdk v1.0.1 // indirect
github.com/pulumi/pulumi-command/sdk v1.0.1
github.com/pulumi/pulumi-docker/sdk/v4 v4.5.7 // indirect
github.com/pulumi/pulumi-libvirt/sdk v0.5.3 // indirect
github.com/pulumi/pulumi-random/sdk/v4 v4.16.7 // indirect
Expand Down
2 changes: 2 additions & 0 deletions test/new-e2e/tests/gpu/gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ func dockerImageName() string {
// TestGPUSuite runs tests for the VM interface to ensure its implementation is correct.
// Not to be run in parallel, as some tests wait until the checks are available.
func TestGPUSuite(t *testing.T) {
// incident-33572
flake.Mark(t)
provParams := getDefaultProvisionerParams()

// Append our vectorAdd image for testing
Expand Down
24 changes: 20 additions & 4 deletions test/new-e2e/tests/gpu/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ import (

"github.com/pulumi/pulumi/sdk/v3/go/pulumi"

"github.com/pulumi/pulumi-command/sdk/go/command/remote"

"github.com/DataDog/test-infra-definitions/common/utils"
"github.com/DataDog/test-infra-definitions/components/command"
"github.com/DataDog/test-infra-definitions/components/datadog/agent"
"github.com/DataDog/test-infra-definitions/components/datadog/agentparams"
"github.com/DataDog/test-infra-definitions/components/docker"
"github.com/DataDog/test-infra-definitions/components/os"
"github.com/DataDog/test-infra-definitions/components/remote"
componentsremote "github.com/DataDog/test-infra-definitions/components/remote"
"github.com/DataDog/test-infra-definitions/resources/aws"
"github.com/DataDog/test-infra-definitions/scenarios/aws/ec2"
"github.com/DataDog/test-infra-definitions/scenarios/aws/fakeintake"
Expand Down Expand Up @@ -151,6 +153,20 @@ func gpuInstanceProvisioner(params *provisionerParams) provisioners.Provisioner
if err != nil {
return fmt.Errorf("validateDockerCuda failed: %w", err)
}
// incident-33572: log the output of the CUDA validation command
pulumi.All(dockerCudaValidateCmd.Stdout, dockerCudaValidateCmd.Stderr).ApplyT(func(outputs []string) error {
stdout := outputs[0]
stderr := outputs[1]
err := ctx.Log.Info(fmt.Sprintf("Docker CUDA validation stdout: %s", stdout), nil)
if err != nil {
return err
}
err = ctx.Log.Info(fmt.Sprintf("Docker CUDA validation stderr: %s", stderr), nil)
if err != nil {
return err
}
return nil
})

// Combine agent options from the parameters with the fakeintake and docker dependencies
params.agentOptions = append(params.agentOptions,
Expand All @@ -177,7 +193,7 @@ func gpuInstanceProvisioner(params *provisionerParams) provisioners.Provisioner
}

// validateGPUDevices checks that there are GPU devices present and accessible
func validateGPUDevices(e aws.Environment, vm *remote.Host) ([]pulumi.Resource, error) {
func validateGPUDevices(e aws.Environment, vm *componentsremote.Host) ([]pulumi.Resource, error) {
commands := map[string]string{
"pci": fmt.Sprintf("lspci -d %s:: | grep NVIDIA", nvidiaPCIVendorID),
"driver": "lsmod | grep nvidia",
Expand All @@ -203,7 +219,7 @@ func validateGPUDevices(e aws.Environment, vm *remote.Host) ([]pulumi.Resource,
return cmds, nil
}

func downloadDockerImages(e aws.Environment, vm *remote.Host, images []string, dependsOn ...pulumi.Resource) ([]pulumi.Resource, error) {
func downloadDockerImages(e aws.Environment, vm *componentsremote.Host, images []string, dependsOn ...pulumi.Resource) ([]pulumi.Resource, error) {
var cmds []pulumi.Resource

for i, image := range images {
Expand All @@ -224,7 +240,7 @@ func downloadDockerImages(e aws.Environment, vm *remote.Host, images []string, d
return cmds, nil
}

func validateDockerCuda(e aws.Environment, vm *remote.Host, dependsOn ...pulumi.Resource) (pulumi.Resource, error) {
func validateDockerCuda(e aws.Environment, vm *componentsremote.Host, dependsOn ...pulumi.Resource) (*remote.Command, error) {
return vm.OS.Runner().Command(
e.CommonNamer().ResourceName("docker-cuda-validate"),
&command.Args{
Expand Down
Loading