Skip to content

Commit

Permalink
add support for GPUs on GCP
Browse files Browse the repository at this point in the history
  • Loading branch information
SamuelStuchly committed Sep 10, 2021
1 parent a02074e commit f2cc121
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 18 deletions.
39 changes: 26 additions & 13 deletions pkg/apis/gcpprovider/v1beta1/gcpmachineproviderconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,26 @@ type GCPMachineProviderSpec struct {
// CredentialsSecret is a reference to the secret with GCP credentials.
CredentialsSecret *corev1.LocalObjectReference `json:"credentialsSecret,omitempty"`

CanIPForward bool `json:"canIPForward"`
DeletionProtection bool `json:"deletionProtection"`
Disks []*GCPDisk `json:"disks,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Metadata []*GCPMetadata `json:"gcpMetadata,omitempty"`
NetworkInterfaces []*GCPNetworkInterface `json:"networkInterfaces,omitempty"`
ServiceAccounts []GCPServiceAccount `json:"serviceAccounts"`
Tags []string `json:"tags,omitempty"`
TargetPools []string `json:"targetPools,omitempty"`
MachineType string `json:"machineType"`
Region string `json:"region"`
Zone string `json:"zone"`
ProjectID string `json:"projectID,omitempty"`
CanIPForward bool `json:"canIPForward"`
DeletionProtection bool `json:"deletionProtection"`
Disks []*GCPDisk `json:"disks,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Metadata []*GCPMetadata `json:"gcpMetadata,omitempty"`
NetworkInterfaces []*GCPNetworkInterface `json:"networkInterfaces,omitempty"`
ServiceAccounts []GCPServiceAccount `json:"serviceAccounts"`
Tags []string `json:"tags,omitempty"`
TargetPools []string `json:"targetPools,omitempty"`
MachineType string `json:"machineType"`
Region string `json:"region"`
Zone string `json:"zone"`
ProjectID string `json:"projectID,omitempty"`
GuestAccelerators []*GCPAcceleratorConfig `json:"guestAccelerators,omitempty"`

// Preemptible indicates if created instance is preemptible
Preemptible bool `json:"preemptible,omitempty"`

OnHostMaintenance string `json:"onHostMaintenance,omitempty"`
AutomaticRestart *bool `json:"automaticRestart,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down Expand Up @@ -104,3 +108,12 @@ type GCPKMSKeyReference struct {
// Location is the GCP location in which the Key Ring exists.
Location string `json:"location"`
}

// GCPAcceleratorConfig describes the type and count of accelerator cards (GPUs) attached to an instance on GCP.
type GCPAcceleratorConfig struct {
// AcceleratorCount is the number of accelerators (GPUs) of type AcceleratorType to attach to the instance.
// The field is omitted from the serialized JSON when it is zero.
AcceleratorCount int64 `json:"acceleratorCount,omitempty"`
// AcceleratorType is the type of accelerator (GPU) to attach to the instance.
// Supported accelerator types are: nvidia-tesla-k80, nvidia-tesla-p100, nvidia-tesla-v100, nvidia-tesla-a100, nvidia-tesla-p4, nvidia-tesla-t4
// The field is omitted from the serialized JSON when it is empty.
AcceleratorType string `json:"acceleratorType,omitempty"`
}
31 changes: 31 additions & 0 deletions pkg/apis/gcpprovider/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

121 changes: 116 additions & 5 deletions pkg/cloud/gcp/actuators/machine/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@ package machine
import (
"context"
"fmt"
"time"

"strconv"
"strings"
"time"

"github.com/openshift/cluster-api-provider-gcp/pkg/apis/gcpprovider/v1beta1"
machinev1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
Expand All @@ -25,6 +24,8 @@ const (
requeueAfterSeconds = 20
instanceLinkFmt = "https://www.googleapis.com/compute/v1/projects/%s/zones/%s/instances/%s"
kmsKeyNameFmt = "projects/%s/locations/%s/keyRings/%s/cryptoKeys/%s"
machineTypeFmt = "zones/%s/machineTypes/%s"
acceleratorTypeFmt = "zones/%s/acceleratorTypes/%s"
)

// Reconciler are list of services required by machine actuator, easy to create a fake
Expand All @@ -39,6 +40,97 @@ func newReconciler(scope *machineScope) *Reconciler {
}
}

// supportedGpuTypes maps each accepted accelerator type name to the name of
// the regional quota metric that accounts for it.
var supportedGpuTypes = map[string]string{
	"nvidia-tesla-a100": "NVIDIA_A100_GPUS",
	"nvidia-tesla-k80":  "NVIDIA_K80_GPUS",
	"nvidia-tesla-p100": "NVIDIA_P100_GPUS",
	"nvidia-tesla-p4":   "NVIDIA_P4_GPUS",
	"nvidia-tesla-t4":   "NVIDIA_T4_GPUS",
	"nvidia-tesla-v100": "NVIDIA_V100_GPUS",
}

// containsString reports whether str is equal to at least one element of sli.
// A nil or empty slice never contains anything.
func containsString(sli []string, str string) bool {
	for i := 0; i < len(sli); i++ {
		if sli[i] != str {
			continue
		}
		return true
	}
	return false
}

// checkQuota verifies that the accelerator (GPU) this machine needs is offered
// in the configured zone and that the region has enough quota left for it.
//
// machineTypeAcceleratorCount is the number of accelerators that come with the
// machine type itself (A2 machine family). When it is non-zero the accelerators
// are treated as that many nvidia-tesla-a100 cards and the providerSpec
// GuestAccelerators are ignored, because additional guest accelerators are not
// allowed on such machine types. When it is zero the first entry of the
// providerSpec GuestAccelerators is validated instead.
//
// Every failure is returned as an InvalidMachineConfiguration error. nil is
// returned when the quota is sufficient, or when there is no accelerator to
// validate at all.
func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
	region, err := r.computeService.RegionGet(r.projectID, r.providerSpec.Region)
	if err != nil {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Failed to get region %s via compute service: %v", r.providerSpec.Region, err))
	}
	if region == nil {
		// A nil region together with a nil error would otherwise be dereferenced below.
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Failed to get region %s via compute service: empty response", r.providerSpec.Region))
	}

	// Only a single accelerator entry is ever validated: a request carrying more
	// than one is rejected by the API ->
	// googleapi: Error 413: Value for field 'resource.guestAccelerators' is too large: maximum size 1 element(s); actual size 2., fieldSizeTooLarge
	var accelerator *v1beta1.GCPAcceleratorConfig
	switch {
	case machineTypeAcceleratorCount != 0:
		accelerator = &v1beta1.GCPAcceleratorConfig{AcceleratorType: "nvidia-tesla-a100", AcceleratorCount: machineTypeAcceleratorCount}
	case len(r.providerSpec.GuestAccelerators) != 0:
		accelerator = r.providerSpec.GuestAccelerators[0]
	default:
		// Nothing requested, so there is no quota to check.
		return nil
	}
	if accelerator == nil {
		return machinecontroller.InvalidMachineConfiguration("Guest accelerator entry must not be empty.")
	}

	// Validate the zone first, then the quota.
	if _, err = r.computeService.AcceleratorTypeGet(r.projectID, r.providerSpec.Zone, accelerator.AcceleratorType); err != nil {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("AcceleratorType %s not available in the zone %s : %v", accelerator.AcceleratorType, r.providerSpec.Zone, err))
	}

	metric := supportedGpuTypes[accelerator.AcceleratorType]
	if metric == "" {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Unsupported accelerator type %s", accelerator.AcceleratorType))
	}
	// Preemptible instances have a separate quota.
	if r.providerSpec.Preemptible {
		metric = "PREEMPTIBLE_" + metric
	}

	for _, q := range region.Quotas {
		if q == nil || q.Metric != metric {
			continue
		}
		if int64(q.Usage)+accelerator.AcceleratorCount > int64(q.Limit) {
			return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Quota exceeded. Metric: %s. Usage: %v. Limit: %v.", metric, q.Usage, q.Limit))
		}
		return nil
	}
	// Reached when no quota entry matches, including when the region reports
	// no quotas at all.
	return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("No quota found. Metric: %s.", metric))
}

// validateGuestAccelerators checks the accelerator (GPU) configuration of the
// machine before the instance is created.
//
// Validation only happens when the providerSpec lists guest accelerators or the
// machine type belongs to the A2 family (prefix "a2-"); otherwise nil is
// returned immediately. Only the A2 and N1 machine type families are accepted.
// The machine type must also be among the types listed for the zone; for A2
// types the accelerator count reported for the machine type is passed to
// checkQuota, for N1 types checkQuota is called with 0.
//
// All failures are returned as InvalidMachineConfiguration errors.
func (r *Reconciler) validateGuestAccelerators() error {
	machineType := r.providerSpec.MachineType
	isA2 := strings.HasPrefix(machineType, "a2-")

	if len(r.providerSpec.GuestAccelerators) == 0 && !isA2 {
		// no accelerators to validate so return nil
		return nil
	}

	// A2 machine types must pass this guard as well, otherwise the A2 branch
	// below could never be reached.
	if !isA2 && !strings.HasPrefix(machineType, "n1-") {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s does not support accelerators. Only A2 and N1 machine type families support guest acceleartors.", r.providerSpec.MachineType))
	}

	// NOTE(review): other calls in this file pass r.projectID rather than
	// r.providerSpec.ProjectID; confirm which one is intended here.
	a2MachineFamily, n1MachineFamily := r.computeService.GPUCompatibleMachineTypesList(r.providerSpec.ProjectID, r.providerSpec.Zone, r.Context)

	if count := a2MachineFamily[machineType]; count != 0 {
		// a2 family machine - has fixed type and count of GPUs
		return r.checkQuota(count)
	}
	if containsString(n1MachineFamily, machineType) {
		// n1 family machine
		return r.checkQuota(0)
	}
	// any other machine type
	return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s is not available in the zone %s.", r.providerSpec.MachineType, r.providerSpec.Zone))
}

// Create creates machine if and only if machine exists, handled by cluster-api
func (r *Reconciler) create() error {
if err := validateMachine(*r.machine, *r.providerSpec); err != nil {
Expand All @@ -50,16 +142,35 @@ func (r *Reconciler) create() error {
CanIpForward: r.providerSpec.CanIPForward,
DeletionProtection: r.providerSpec.DeletionProtection,
Labels: r.providerSpec.Labels,
MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", zone, r.providerSpec.MachineType),
MachineType: fmt.Sprintf(machineTypeFmt, zone, r.providerSpec.MachineType),
Name: r.machine.Name,
Tags: &compute.Tags{
Items: r.providerSpec.Tags,
},
Scheduling: &compute.Scheduling{
Preemptible: r.providerSpec.Preemptible,
Preemptible: r.providerSpec.Preemptible,
AutomaticRestart: r.providerSpec.AutomaticRestart,
OnHostMaintenance: r.providerSpec.OnHostMaintenance,
},
}

var guestAccelerators = []*compute.AcceleratorConfig{}

if l := len(r.providerSpec.GuestAccelerators); l == 1 {
guestAccelerators = append(guestAccelerators, &compute.AcceleratorConfig{
AcceleratorType: fmt.Sprintf(acceleratorTypeFmt, zone, r.providerSpec.GuestAccelerators[0].AcceleratorType),
AcceleratorCount: r.providerSpec.GuestAccelerators[0].AcceleratorCount,
})
} else if l > 1 {
return machinecontroller.InvalidMachineConfiguration("More than one type of accelerator provided. Instances support only one accelerator type at a time.")
}

instance.GuestAccelerators = guestAccelerators

if err := r.validateGuestAccelerators(); err != nil {
return err
}

if instance.Labels == nil {
instance.Labels = map[string]string{}
}
Expand All @@ -70,7 +181,7 @@ func (r *Reconciler) create() error {
for _, disk := range r.providerSpec.Disks {
srcImage := disk.Image
if !strings.Contains(disk.Image, "/") {
// only image name provided therfore defaulting to the current project
// only image name provided therefore defaulting to the current project
srcImage = googleapi.ResolveRelative(r.computeService.BasePath(), fmt.Sprintf("%s/global/images/%s", r.projectID, disk.Image))
}

Expand Down
37 changes: 37 additions & 0 deletions pkg/cloud/gcp/actuators/services/compute/computeservice.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package computeservice

import (
"context"
"log"
"strings"

"github.com/openshift/cluster-api-provider-gcp/pkg/cloud/gcp/actuators/util"
"github.com/openshift/cluster-api-provider-gcp/pkg/version"
"google.golang.org/api/compute/v1"
Expand All @@ -19,6 +23,9 @@ type GCPComputeService interface {
TargetPoolsAddInstance(project string, region string, name string, instance string) (*compute.Operation, error)
TargetPoolsRemoveInstance(project string, region string, name string, instance string) (*compute.Operation, error)
MachineTypesGet(project string, machineType string, zone string) (*compute.MachineType, error)
RegionGet(project string, region string) (*compute.Region, error)
GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string)
AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error)
}

type computeService struct {
Expand Down Expand Up @@ -101,3 +108,33 @@ func (c *computeService) TargetPoolsRemoveInstance(project string, region string
// MachineTypesGet fetches the machine type named machineType in the given
// project and zone through the wrapped compute service.
func (c *computeService) MachineTypesGet(project string, zone string, machineType string) (*compute.MachineType, error) {
	call := c.service.MachineTypes.Get(project, zone, machineType)
	return call.Do()
}

// GPUCompatibleMachineTypesList lists the machine types available in the zone
// and returns the ones that can carry GPUs:
//   - a map from A2 family machine type name to the accelerator count reported
//     for that machine type, and
//   - a slice with the names of the N1 family machine types.
//
// The signature has no error result, so a failure while listing is logged and
// reported to the caller as empty results (nil, nil) instead of terminating
// the whole process.
func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
	var (
		a2MachineFamily = map[string]int64{}
		n1MachineFamily []string
	)

	req := c.service.MachineTypes.List(project, zone)
	err := req.Pages(ctx, func(page *compute.MachineTypeList) error {
		for _, machineType := range page.Items {
			switch {
			case strings.HasPrefix(machineType.Name, "a2"):
				// Skip entries without accelerator information instead of
				// indexing into an empty slice.
				if len(machineType.Accelerators) == 0 {
					continue
				}
				a2MachineFamily[machineType.Name] = machineType.Accelerators[0].GuestAcceleratorCount
			case strings.HasPrefix(machineType.Name, "n1"):
				n1MachineFamily = append(n1MachineFamily, machineType.Name)
			}
		}
		return nil
	})
	if err != nil {
		log.Printf("listing machine types in zone %s of project %s: %v", zone, project, err)
		return nil, nil
	}
	return a2MachineFamily, n1MachineFamily
}

// AcceleratorTypeGet fetches the accelerator type named acceleratorType in the
// given project and zone through the wrapped compute service.
func (c *computeService) AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error) {
	call := c.service.AcceleratorTypes.Get(project, zone, acceleratorType)
	return call.Do()
}

// RegionGet fetches the region resource named region in the given project
// through the wrapped compute service.
func (c *computeService) RegionGet(project string, region string) (*compute.Region, error) {
	call := c.service.Regions.Get(project, region)
	return call.Do()
}
13 changes: 13 additions & 0 deletions pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package computeservice

import (
"context"

compute "google.golang.org/api/compute/v1"
"google.golang.org/api/googleapi"
)
Expand Down Expand Up @@ -129,3 +131,14 @@ func MockBuilderFuncTypeNotFound(serviceAccountJSON string) (GCPComputeService,
}
return computeSvc, nil
}

// RegionGet is a mock stub: it ignores its arguments and always returns a nil
// region together with a nil error.
func (c *GCPComputeServiceMock) RegionGet(project string, region string) (*compute.Region, error) {
return nil, nil
}

// GPUCompatibleMachineTypesList is a mock stub: it ignores its arguments and
// always returns a nil map and a nil slice.
func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
return nil, nil
}
// AcceleratorTypeGet is a mock stub: it ignores its arguments and always
// returns a nil accelerator type together with a nil error.
func (c *GCPComputeServiceMock) AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error) {
return nil, nil
}

0 comments on commit f2cc121

Please sign in to comment.