Skip to content

Commit

Permalink
Add common attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
johnbelamaric committed Jun 14, 2024
1 parent cde999c commit 8da0e74
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 5,011 deletions.
5 changes: 5 additions & 0 deletions dra-evolution/pkg/api/capacity_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ type ResourcePoolSpec struct {
// +optional
SharedCapacity []SharedCapacity `json:"sharedCapacity,omitempty"`

// Attributes contains common device attributes that are the same
// for all devices in the pool, unless a device specifically
// overrides one by defining an attribute of the same name.
Attributes []DeviceAttribute `json:"attributes,omitempty"`

// Devices lists all available devices in this pool.
//
// Must not have more than 128 entries.
Expand Down
35 changes: 32 additions & 3 deletions dra-evolution/pkg/gen/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,23 @@ import (
)

func dgxa100Pool(nodeName, poolName string, gpus int) (*api.ResourcePool, error) {
commonAttrs := map[string]bool{
"product-name": true,
"brand": true,
"architecture": true,
"cuda-compute-capability": true,
"driver-version": true,
"cuda-driver-version": true,
}

// Create a mock dgxa100 server and build an nvDeviceLib from it.
// The nvDeviceLib is then used to populate the list of allocatable
// devices from this mock server using standard NVML calls.
l := nvdevicelib.New(dgxa100.New())

var devices []api.Device
var shared []api.SharedCapacity
var common []api.DeviceAttribute
for gpu := 0; gpu < gpus; gpu++ {
// Get the full list of allocatable devices from this GPU on the server
allocatable, err := l.GetPerGpuAllocatableDevices(gpu)
Expand All @@ -41,7 +51,18 @@ func dgxa100Pool(nodeName, poolName string, gpus int) (*api.ResourcePool, error)

shared = append(shared, sharedGroupToResources(model.NamedResources.SharedLimits[0], gpu)...)
for _, instance := range model.NamedResources.Instances {
devices = append(devices, instanceToDevice(instance, gpu))
if common == nil {
attrs := attributesToDeviceAttributes(instance.Attributes)
for k := range commonAttrs {
for _, attr := range attrs {
if attr.Name == k {
common = append(common, attr)
}
}
}

}
devices = append(devices, instanceToDevice(instance, gpu, commonAttrs))
}
}

Expand All @@ -57,15 +78,23 @@ func dgxa100Pool(nodeName, poolName string, gpus int) (*api.ResourcePool, error)
NodeName: nodeName,
DriverName: "gpu.nvidia.com/dra",
SharedCapacity: shared,
Attributes: common,
Devices: devices,
},
}, nil
}

func instanceToDevice(instance newresourceapi.NamedResourcesInstance, gpu int) api.Device {
func instanceToDevice(instance newresourceapi.NamedResourcesInstance, gpu int, commonAttrs map[string]bool) api.Device {
var attrs []api.DeviceAttribute
for _, attr := range attributesToDeviceAttributes(instance.Attributes) {
if _, ok := commonAttrs[attr.Name]; ok {
continue
}
attrs = append(attrs, attr)
}
device := api.Device{
Name: instance.Name,
Attributes: attributesToDeviceAttributes(instance.Attributes),
Attributes: attrs,
}

if len(instance.Resources) > 0 {
Expand Down
Loading

0 comments on commit 8da0e74

Please sign in to comment.