From 42e3dcdcc40c19a89a46b28ebf1ac862d3ebd411 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 21 Jun 2021 11:25:34 -0400 Subject: [PATCH] devices: externalize nvidia device driver --- .circleci/config.yml | 6 +- devices/gpu/nvidia/README.md | 21 - devices/gpu/nvidia/cmd/main.go | 20 - devices/gpu/nvidia/device.go | 228 - devices/gpu/nvidia/device_test.go | 140 - devices/gpu/nvidia/fingerprint.go | 229 - devices/gpu/nvidia/fingerprint_test.go | 1361 ---- devices/gpu/nvidia/nvml/client.go | 194 - devices/gpu/nvidia/nvml/client_test.go | 399 -- devices/gpu/nvidia/nvml/driver_default.go | 33 - devices/gpu/nvidia/nvml/driver_linux.go | 85 - devices/gpu/nvidia/nvml/shared.go | 61 - devices/gpu/nvidia/stats.go | 325 - devices/gpu/nvidia/stats_test.go | 3041 --------- go.mod | 1 - go.sum | 2 - .../catalog/register_nvidia_linux.go | 14 - .../NVIDIA/gpu-monitoring-tools/LICENSE | 29 - .../bindings/go/nvml/bindings.go | 634 -- .../bindings/go/nvml/nvml.go | 533 -- .../bindings/go/nvml/nvml.h | 5871 ----------------- .../bindings/go/nvml/nvml_dl.c | 46 - .../bindings/go/nvml/nvml_dl.h | 15 - vendor/modules.txt | 3 - .../content/docs/devices/external/index.mdx | 31 +- .../docs/devices/{ => external}/nvidia.mdx | 0 website/content/docs/devices/index.mdx | 15 +- website/data/docs-nav-data.json | 12 +- 28 files changed, 27 insertions(+), 13322 deletions(-) delete mode 100644 devices/gpu/nvidia/README.md delete mode 100644 devices/gpu/nvidia/cmd/main.go delete mode 100644 devices/gpu/nvidia/device.go delete mode 100644 devices/gpu/nvidia/device_test.go delete mode 100644 devices/gpu/nvidia/fingerprint.go delete mode 100644 devices/gpu/nvidia/fingerprint_test.go delete mode 100644 devices/gpu/nvidia/nvml/client.go delete mode 100644 devices/gpu/nvidia/nvml/client_test.go delete mode 100644 devices/gpu/nvidia/nvml/driver_default.go delete mode 100644 devices/gpu/nvidia/nvml/driver_linux.go delete mode 100644 devices/gpu/nvidia/nvml/shared.go delete mode 100644 
devices/gpu/nvidia/stats.go delete mode 100644 devices/gpu/nvidia/stats_test.go delete mode 100644 helper/pluginutils/catalog/register_nvidia_linux.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h rename website/content/docs/devices/{ => external}/nvidia.mdx (100%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0a4cec135dc..0fcf418656d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -620,13 +620,9 @@ workflows: test_module: "api" filters: *backend_test_branches_filter enable_race_testing: true - - test-container: - name: "test-devices" - test_packages: "./devices/..." - filters: *backend_test_branches_filter - test-machine: name: "test-other" - exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./devices|./e2e" + exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./e2e" filters: *backend_test_branches_filter - test-machine: name: "test-docker" diff --git a/devices/gpu/nvidia/README.md b/devices/gpu/nvidia/README.md deleted file mode 100644 index 1035c7c8940..00000000000 --- a/devices/gpu/nvidia/README.md +++ /dev/null @@ -1,21 +0,0 @@ -This package provides an implementation of nvidia device plugin - -# Behavior - -Nvidia device plugin uses NVML bindings to get data regarding available nvidia devices and will expose them via Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. 
Plugin sends statistics for fingerprinted devices every `stats_period` period. - -# Config - -The configuration should be passed via an HCL file that begins with a top level `config` stanza: - -``` -config { - ignored_gpu_ids = ["uuid1", "uuid2"] - fingerprint_period = "5s" -} -``` - -The valid configuration options are: - -* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUIDs strings that should not be exposed to nomad -* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes. diff --git a/devices/gpu/nvidia/cmd/main.go b/devices/gpu/nvidia/cmd/main.go deleted file mode 100644 index 5c0bea6c4d8..00000000000 --- a/devices/gpu/nvidia/cmd/main.go +++ /dev/null @@ -1,20 +0,0 @@ -package main - -import ( - "context" - - log "github.com/hashicorp/go-hclog" - - "github.com/hashicorp/nomad/devices/gpu/nvidia" - "github.com/hashicorp/nomad/plugins" -) - -func main() { - // Serve the plugin - plugins.ServeCtx(factory) -} - -// factory returns a new instance of the Nvidia GPU plugin -func factory(ctx context.Context, log log.Logger) interface{} { - return nvidia.NewNvidiaDevice(ctx, log) -} diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go deleted file mode 100644 index 67680dc2a0e..00000000000 --- a/devices/gpu/nvidia/device.go +++ /dev/null @@ -1,228 +0,0 @@ -package nvidia - -import ( - "context" - "fmt" - "strings" - "sync" - "time" - - log "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper/pluginutils/loader" - "github.com/hashicorp/nomad/plugins/base" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/hclspec" -) - -const ( - // pluginName is the name of the plugin - pluginName = "nvidia-gpu" - - // vendor is the vendor providing the devices - vendor = "nvidia" - - // deviceType is the type of device being returned - deviceType = device.DeviceTypeGPU - - // notAvailable 
value is returned to nomad server in case some properties were - // undetected by nvml driver - notAvailable = "N/A" - - // Nvidia-container-runtime environment variable names - NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" -) - -var ( - // PluginID is the nvidia plugin metadata registered in the plugin - // catalog. - PluginID = loader.PluginID{ - Name: pluginName, - PluginType: base.PluginTypeDevice, - } - - // PluginConfig is the nvidia factory function registered in the - // plugin catalog. - PluginConfig = &loader.InternalPluginConfig{ - Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) }, - } - - // pluginInfo describes the plugin - pluginInfo = &base.PluginInfoResponse{ - Type: base.PluginTypeDevice, - PluginApiVersions: []string{device.ApiVersion010}, - PluginVersion: "0.1.0", - Name: pluginName, - } - - // configSpec is the specification of the plugin's configuration - configSpec = hclspec.NewObject(map[string]*hclspec.Spec{ - "enabled": hclspec.NewDefault( - hclspec.NewAttr("enabled", "bool", false), - hclspec.NewLiteral("true"), - ), - "ignored_gpu_ids": hclspec.NewDefault( - hclspec.NewAttr("ignored_gpu_ids", "list(string)", false), - hclspec.NewLiteral("[]"), - ), - "fingerprint_period": hclspec.NewDefault( - hclspec.NewAttr("fingerprint_period", "string", false), - hclspec.NewLiteral("\"1m\""), - ), - }) -) - -// Config contains configuration information for the plugin. 
-type Config struct { - Enabled bool `codec:"enabled"` - IgnoredGPUIDs []string `codec:"ignored_gpu_ids"` - FingerprintPeriod string `codec:"fingerprint_period"` -} - -// NvidiaDevice contains all plugin specific data -type NvidiaDevice struct { - // enabled indicates whether the plugin should be enabled - enabled bool - - // nvmlClient is used to get data from nvidia - nvmlClient nvml.NvmlClient - - // initErr holds an error retrieved during - // nvmlClient initialization - initErr error - - // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad - ignoredGPUIDs map[string]struct{} - - // fingerprintPeriod is how often we should call nvml to get list of devices - fingerprintPeriod time.Duration - - // devices is the set of detected eligible devices - devices map[string]struct{} - deviceLock sync.RWMutex - - logger log.Logger -} - -// NewNvidiaDevice returns a new nvidia device plugin. -func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice { - nvmlClient, err := nvml.NewNvmlClient() - logger := log.Named(pluginName) - if err != nil && err.Error() != nvml.UnavailableLib.Error() { - logger.Error("unable to initialize Nvidia driver", "reason", err) - } - return &NvidiaDevice{ - logger: logger, - devices: make(map[string]struct{}), - ignoredGPUIDs: make(map[string]struct{}), - nvmlClient: nvmlClient, - initErr: err, - } -} - -// PluginInfo returns information describing the plugin. -func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) { - return pluginInfo, nil -} - -// ConfigSchema returns the plugins configuration schema. -func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) { - return configSpec, nil -} - -// SetConfig is used to set the configuration of the plugin. 
-func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { - var config Config - if len(cfg.PluginConfig) != 0 { - if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil { - return err - } - } - - d.enabled = config.Enabled - - for _, ignoredGPUId := range config.IgnoredGPUIDs { - d.ignoredGPUIDs[ignoredGPUId] = struct{}{} - } - - period, err := time.ParseDuration(config.FingerprintPeriod) - if err != nil { - return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err) - } - d.fingerprintPeriod = period - - return nil -} - -// Fingerprint streams detected devices. If device changes are detected or the -// devices health changes, messages will be emitted. -func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.FingerprintResponse) - go d.fingerprint(ctx, outCh) - return outCh, nil -} - -type reservationError struct { - notExistingIDs []string -} - -func (e *reservationError) Error() string { - return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ",")) -} - -// Reserve returns information on how to mount given devices. 
-// Assumption is made that nomad server is responsible for correctness of -// GPU allocations, handling tricky cases such as double-allocation of single GPU -func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) { - if len(deviceIDs) == 0 { - return &device.ContainerReservation{}, nil - } - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - // Due to the asynchronous nature of NvidiaPlugin, there is a possibility - // of race condition - // - // Timeline: - // 1 - fingerprint reports that GPU with id "1" is present - // 2 - the following events happen at the same time: - // a) server decides to allocate GPU with id "1" - // b) fingerprint check reports that GPU with id "1" is no more present - // - // The latest and always valid version of fingerprinted ids are stored in - // d.devices map. To avoid this race condition an error is returned if - // any of provided deviceIDs is not found in d.devices map - d.deviceLock.RLock() - var notExistingIDs []string - for _, id := range deviceIDs { - if _, deviceIDExists := d.devices[id]; !deviceIDExists { - notExistingIDs = append(notExistingIDs, id) - } - } - d.deviceLock.RUnlock() - if len(notExistingIDs) != 0 { - return nil, &reservationError{notExistingIDs} - } - - return &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: strings.Join(deviceIDs, ","), - }, - }, nil -} - -// Stats streams statistics for the detected devices. 
-func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.StatsResponse) - go d.stats(ctx, outCh, interval) - return outCh, nil -} diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go deleted file mode 100644 index a5ec354e243..00000000000 --- a/devices/gpu/nvidia/device_test.go +++ /dev/null @@ -1,140 +0,0 @@ -package nvidia - -import ( - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/plugins/device" - "github.com/stretchr/testify/require" -) - -type MockNvmlClient struct { - FingerprintError error - FingerprintResponseReturned *nvml.FingerprintData - - StatsError error - StatsResponseReturned []*nvml.StatsData -} - -func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) { - return c.FingerprintResponseReturned, c.FingerprintError -} - -func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { - return c.StatsResponseReturned, c.StatsError -} - -func TestReserve(t *testing.T) { - cases := []struct { - Name string - ExpectedReservation *device.ContainerReservation - ExpectedError error - Device *NvidiaDevice - RequestedIDs []string - }{ - { - Name: "All RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - "UUID3", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Some RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID3": {}, - }, - logger: 
hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "All RequestedIDs are managed by Device", - ExpectedReservation: &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: "UUID1,UUID2,UUID3", - }, - }, - ExpectedError: nil, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "No IDs requested", - ExpectedReservation: &device.ContainerReservation{}, - ExpectedError: nil, - RequestedIDs: nil, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Device is disabled", - ExpectedReservation: nil, - ExpectedError: device.ErrPluginDisabled, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: false, - }, - }, - } - - for _, c := range cases { - t.Run(c.Name, func(t *testing.T) { - actualReservation, actualError := c.Device.Reserve(c.RequestedIDs) - require.Equal(t, c.ExpectedReservation, actualReservation) - require.Equal(t, c.ExpectedError, actualError) - }) - } -} diff --git a/devices/gpu/nvidia/fingerprint.go b/devices/gpu/nvidia/fingerprint.go deleted file mode 100644 index 45bb34fa335..00000000000 --- a/devices/gpu/nvidia/fingerprint.go +++ /dev/null @@ -1,229 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names and units for reporting Fingerprint output - MemoryAttr = "memory" - PowerAttr = "power" - BAR1Attr = "bar1" - 
DriverVersionAttr = "driver_version" - CoresClockAttr = "cores_clock" - MemoryClockAttr = "memory_clock" - PCIBandwidthAttr = "pci_bandwidth" - DisplayStateAttr = "display_state" - PersistenceModeAttr = "persistence_mode" -) - -// fingerprint is the long running goroutine that detects hardware -func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) { - defer close(devices) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr) - devices <- device.NewFingerprintError(d.initErr) - } - - // Just close the channel to let server know that there are no working - // Nvidia GPU units - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - ticker.Reset(d.fingerprintPeriod) - } - d.writeFingerprintToChannel(devices) - } -} - -// writeFingerprintToChannel makes nvml call and writes response to channel -func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) { - fingerprintData, err := d.nvmlClient.GetFingerprintData() - if err != nil { - d.logger.Error("failed to get fingerprint nvidia devices", "error", err) - devices <- device.NewFingerprintError(err) - return - } - - // ignore devices from fingerprint output - fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs) - // check if any device health was updated or any device was added to host - if !d.fingerprintChanged(fingerprintDevices) { - return - } - - commonAttributes := map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr(fingerprintData.DriverVersion), - }, - } - - // Group all FingerprintDevices by DeviceName attribute - deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData) - for _, device := range 
fingerprintDevices { - deviceName := device.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = &notAvailableCopy - } - - deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device) - } - - // Build Fingerprint response with computed groups and send it over the channel - deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName)) - for groupName, devices := range deviceListByDeviceName { - deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes)) - } - devices <- device.NewFingerprint(deviceGroups...) -} - -// ignoreFingerprintedDevices excludes ignored devices from fingerprint output -func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData { - var result []*nvml.FingerprintDeviceData - for _, fingerprintDevice := range deviceData { - if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored { - result = append(result, fingerprintDevice) - } - } - return result -} - -// fingerprintChanged checks if there are any previously unseen nvidia devices located -// or any of fingerprinted nvidia devices disappeared since the last fingerprint run. 
-// Also, this func updates device map on NvidiaDevice with the latest data -func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool { - d.deviceLock.Lock() - defer d.deviceLock.Unlock() - - changeDetected := false - // check if every device in allDevices is in d.devices - for _, device := range allDevices { - if _, ok := d.devices[device.UUID]; !ok { - changeDetected = true - } - } - - // check if every device in d.devices is in allDevices - fingerprintDeviceMap := make(map[string]struct{}) - for _, device := range allDevices { - fingerprintDeviceMap[device.UUID] = struct{}{} - } - for id := range d.devices { - if _, ok := fingerprintDeviceMap[id]; !ok { - changeDetected = true - } - } - - d.devices = fingerprintDeviceMap - return changeDetected -} - -// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice -func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup { - // deviceGroup without devices makes no sense -> return nil when no devices are provided - if len(deviceList) == 0 { - return nil - } - - devices := make([]*device.Device, len(deviceList)) - for index, dev := range deviceList { - devices[index] = &device.Device{ - ID: dev.UUID, - // all fingerprinted devices are "healthy" for now - // to get real health data -> dcgm bindings should be used - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: dev.PCIBusID, - }, - } - } - - deviceGroup := &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - Devices: devices, - // Assumption made that devices with the same DeviceName have the same - // attributes like amount of memory, power, bar1memory etc - Attributes: attributesFromFingerprintDeviceData(deviceList[0]), - } - - // Extend attribute map with common attributes - for attributeKey, attributeValue := range commonAttributes { - 
deviceGroup.Attributes[attributeKey] = attributeValue - } - - return deviceGroup -} - -// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData -// struct to device.DeviceGroup.Attributes format (map[string]string) -// this function performs all nil checks for FingerprintDeviceData pointers -func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute { - attrs := map[string]*structs.Attribute{ - DisplayStateAttr: { - String: helper.StringToPtr(d.DisplayState), - }, - PersistenceModeAttr: { - String: helper.StringToPtr(d.PersistenceMode), - }, - } - - if d.MemoryMiB != nil { - attrs[MemoryAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryMiB)), - Unit: structs.UnitMiB, - } - } - if d.PowerW != nil { - attrs[PowerAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PowerW)), - Unit: structs.UnitW, - } - } - if d.BAR1MiB != nil { - attrs[BAR1Attr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.BAR1MiB)), - Unit: structs.UnitMiB, - } - } - if d.CoresClockMHz != nil { - attrs[CoresClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.MemoryClockMHz != nil { - attrs[MemoryClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.PCIBandwidthMBPerS != nil { - attrs[PCIBandwidthAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)), - Unit: structs.UnitMBPerS, - } - } - - return attrs -} diff --git a/devices/gpu/nvidia/fingerprint_test.go b/devices/gpu/nvidia/fingerprint_test.go deleted file mode 100644 index c85b5c8c90a..00000000000 --- a/devices/gpu/nvidia/fingerprint_test.go +++ /dev/null @@ -1,1361 +0,0 @@ -package nvidia - -import ( - "context" - "errors" - "sort" - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - 
"github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestIgnoreFingerprintedDevices(t *testing.T) { - for _, testCase := range []struct { - Name string - DeviceData []*nvml.FingerprintDeviceData - IgnoredGPUIds map[string]struct{} - ExpectedResult []*nvml.FingerprintDeviceData - }{ - { - Name: "Odd ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "Even ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: 
&nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "All ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: nil, - }, - { - Name: "No ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{}, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "No DeviceData provided", - DeviceData: nil, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - 
"UUID3": {}, - }, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestCheckFingerprintUpdates(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - AllDevices []*nvml.FingerprintDeviceData - DeviceMapAfterMethodCall map[string]struct{} - ExpectedResult bool - }{ - { - Name: "No updates", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: false, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "New Device Appeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "I am new", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - "I am new": {}, - }, - }, - { - Name: "Device disappeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - }, - }, - { - Name: "No devices in NvidiaDevice map", - Device: &NvidiaDevice{}, - 
AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "No devices detected", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: nil, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{}, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) - req := require.New(t) - // check that function returns valid "updated / not updated" state - req.Equal(testCase.ExpectedResult, actualResult) - // check that function propely updates devices map - req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) - }) - } -} - -func TestAttributesFromFingerprintDeviceData(t *testing.T) { - for _, testCase := range []struct { - Name string - FingerprintDeviceData *nvml.FingerprintDeviceData - ExpectedResult map[string]*structs.Attribute - }{ - { - Name: "All attributes are not nil", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(256), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: 
helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - { - Name: "nil values are omitted", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) - require.Equal(t, testCase.ExpectedResult, actualResult) - }) - } -} - -func TestDeviceGroupFromFingerprintData(t *testing.T) { - for _, testCase := range []struct { - Name string - GroupName string - Devices []*nvml.FingerprintDeviceData - CommonAttributes map[string]*structs.Attribute - ExpectedResult *device.DeviceGroup - }{ - { - Name: "Devices are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: 
"Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - }, - { - Name: "Devices and common attributes are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { 
- DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - { - Name: "Devices are not provided", - GroupName: "Type1", - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - Devices: nil, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) - 
require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestWriteFingerprintToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that FingerprintError is handled properly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New(""), - }, - }, - { - Name: "Check ignore devices works correctly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - ignoredGPUIDs: map[string]struct{}{ - "1": {}, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: 
map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 1", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name3"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - 
DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name3", - Devices: []*device.Device{ - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(12), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 2", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - 
}, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - 
Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - channel := make(chan *device.FingerprintResponse, 1) - testCase.Device.writeFingerprintToChannel(channel) - actualResult := <-channel - // writeFingerprintToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedResult arrays has to be sorted firsted - sort.Slice(actualResult.Devices, func(i, j int) bool { - return actualResult.Devices[i].Name < actualResult.Devices[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name - }) - require.Equal(t, testCase.ExpectedWriteToChannel, actualResult) - }) - } -} - -// Test if nonworking driver returns empty fingerprint data -func TestFingerprint(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that working driver returns valid fingeprint data", - Device: &NvidiaDevice{ - initErr: nil, - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - 
Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: 
structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check that not working driver returns error fingeprint data", - Device: &NvidiaDevice{ - initErr: errors.New("foo"), - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New("foo"), - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - outCh := make(chan *device.FingerprintResponse) - ctx, cancel := context.WithCancel(context.Background()) - go testCase.Device.fingerprint(ctx, outCh) - result := <-outCh - cancel() - require.New(t).Equal(result, testCase.ExpectedWriteToChannel) - }) - } -} diff --git a/devices/gpu/nvidia/nvml/client.go b/devices/gpu/nvidia/nvml/client.go deleted file mode 100644 index d18dcbe1a9f..00000000000 --- a/devices/gpu/nvidia/nvml/client.go +++ /dev/null @@ -1,194 +0,0 @@ -package nvml - -import ( - "fmt" -) - -// DeviceData represents common 
fields for Nvidia device -type DeviceData struct { - UUID string - DeviceName *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 -} - -// FingerprintDeviceData is a superset of DeviceData -// it describes device specific fields returned from -// nvml queries during fingerprinting call -type FingerprintDeviceData struct { - *DeviceData - PCIBandwidthMBPerS *uint - CoresClockMHz *uint - MemoryClockMHz *uint - DisplayState string - PersistenceMode string - PCIBusID string -} - -// FingerprintData represets attributes of driver/devices -type FingerprintData struct { - Devices []*FingerprintDeviceData - DriverVersion string -} - -// StatsData is a superset of DeviceData -// it represents statistics data returned for every Nvidia device -type StatsData struct { - *DeviceData - PowerUsageW *uint - GPUUtilization *uint - MemoryUtilization *uint - EncoderUtilization *uint - DecoderUtilization *uint - TemperatureC *uint - UsedMemoryMiB *uint64 - BAR1UsedMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} - -// NvmlClient describes how users would use nvml library -type NvmlClient interface { - GetFingerprintData() (*FingerprintData, error) - GetStatsData() ([]*StatsData, error) -} - -// nvmlClient implements NvmlClient -// Users of this lib are expected to use this struct via NewNvmlClient func -type nvmlClient struct { - driver NvmlDriver -} - -// NewNvmlClient function creates new nvmlClient with real -// NvmlDriver implementation. 
Also, this func initializes NvmlDriver -func NewNvmlClient() (*nvmlClient, error) { - driver := &nvmlDriver{} - err := driver.Initialize() - if err != nil { - return nil, err - } - return &nvmlClient{ - driver: driver, - }, nil -} - -// GetFingerprintData returns FingerprintData for available Nvidia devices -func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { - /* - nvml fields to be fingerprinted # nvml_library_call - 1 - Driver Version # nvmlSystemGetDriverVersion - 2 - Product Name # nvmlDeviceGetName - 3 - GPU UUID # nvmlDeviceGetUUID - 4 - Total Memory # nvmlDeviceGetMemoryInfo - 5 - Power # nvmlDeviceGetPowerManagementLimit - 6 - PCIBusID # nvmlDeviceGetPciInfo - 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo( - 8 - PCI Bandwidth - 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo - 10 - Display Mode # nvmlDeviceGetDisplayMode - 11 - Persistence Mode # nvmlDeviceGetPersistenceMode - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - driverVersion, err := c.driver.SystemDriverVersion() - if err != nil { - return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) - } - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) - } - - allNvidiaGPUResources[i] = &FingerprintDeviceData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS, - CoresClockMHz: deviceInfo.CoresClockMHz, - MemoryClockMHz: 
deviceInfo.MemoryClockMHz, - DisplayState: deviceInfo.DisplayState, - PersistenceMode: deviceInfo.PersistenceMode, - PCIBusID: deviceInfo.PCIBusID, - } - } - return &FingerprintData{ - Devices: allNvidiaGPUResources, - DriverVersion: driverVersion, - }, nil -} - -// GetStatsData returns statistics data for all devices on this machine -func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { - /* - nvml fields to be reported to stats api # nvml_library_call - 1 - Used Memory # nvmlDeviceGetMemoryInfo - 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates - 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates - 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization - 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization - 6 - Current GPU Temperature # nvmlDeviceGetTemperature - 7 - Power Draw # nvmlDeviceGetPowerUsage - 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo - 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter - 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter - 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUStats := make([]*StatsData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) - } - - allNvidiaGPUStats[i] = &StatsData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PowerUsageW: deviceStatus.PowerUsageW, - GPUUtilization: 
deviceStatus.GPUUtilization, - MemoryUtilization: deviceStatus.MemoryUtilization, - EncoderUtilization: deviceStatus.EncoderUtilization, - DecoderUtilization: deviceStatus.DecoderUtilization, - TemperatureC: deviceStatus.TemperatureC, - UsedMemoryMiB: deviceStatus.UsedMemoryMiB, - BAR1UsedMiB: deviceStatus.BAR1UsedMiB, - ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, - ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, - ECCErrorsDevice: deviceStatus.ECCErrorsDevice, - } - } - return allNvidiaGPUStats, nil -} diff --git a/devices/gpu/nvidia/nvml/client_test.go b/devices/gpu/nvidia/nvml/client_test.go deleted file mode 100644 index 23731f7b052..00000000000 --- a/devices/gpu/nvidia/nvml/client_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package nvml - -import ( - "errors" - "testing" - - "github.com/hashicorp/nomad/helper" - "github.com/stretchr/testify/require" -) - -type MockNVMLDriver struct { - systemDriverCallSuccessful bool - deviceCountCallSuccessful bool - deviceInfoByIndexCallSuccessful bool - deviceInfoAndStatusByIndexCallSuccessful bool - driverVersion string - devices []*DeviceInfo - deviceStatus []*DeviceStatus -} - -func (m *MockNVMLDriver) Initialize() error { - return nil -} - -func (m *MockNVMLDriver) Shutdown() error { - return nil -} - -func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { - if !m.systemDriverCallSuccessful { - return "", errors.New("failed to get system driver") - } - return m.driverVersion, nil -} - -func (m *MockNVMLDriver) DeviceCount() (uint, error) { - if !m.deviceCountCallSuccessful { - return 0, errors.New("failed to get device length") - } - return uint(len(m.devices)), nil -} - -func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - if index >= uint(len(m.devices)) { - return nil, errors.New("index is out of range") - } - if !m.deviceInfoByIndexCallSuccessful { - return nil, errors.New("failed to get device info by index") - } - return m.devices[index], nil -} - -func (m *MockNVMLDriver) 
DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) { - return nil, nil, errors.New("index is out of range") - } - if !m.deviceInfoAndStatusByIndexCallSuccessful { - return nil, nil, errors.New("failed to get device info and status by index") - } - return m.devices[index], m.deviceStatus[index], nil -} - -func TestGetFingerprintDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult *FingerprintData - }{ - { - Name: "fail on systemDriverCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: false, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceInfoByIndexCall", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - 
MemoryClockMHz: helper.UintToPtr(100), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: &FingerprintData{ - DriverVersion: "driverVersion", - Devices: []*FingerprintDeviceData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - PCIBusID: "busId1", - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - PCIBusID: "busId2", - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - driverVersion: "driverVersion", - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - 
PersistenceMode: "Enabled", - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - fingerprintData, err := cli.GetFingerprintData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, fingerprintData) - } -} - -func TestGetStatsDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult []*StatsData - }{ - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on DeviceInfoAndStatusByIndex call", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - 
GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: []*StatsData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - 
DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - DriverConfiguration: &MockNVMLDriver{ - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - 
PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - statsData, err := cli.GetStatsData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, statsData) - } -} diff --git a/devices/gpu/nvidia/nvml/driver_default.go b/devices/gpu/nvidia/nvml/driver_default.go deleted file mode 100644 index e67efa22eea..00000000000 --- a/devices/gpu/nvidia/nvml/driver_default.go +++ /dev/null @@ -1,33 +0,0 @@ -// +build !linux - -package nvml - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return UnavailableLib -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return UnavailableLib -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return "", UnavailableLib -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return 0, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - return nil, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - return nil, nil, UnavailableLib -} diff --git a/devices/gpu/nvidia/nvml/driver_linux.go b/devices/gpu/nvidia/nvml/driver_linux.go deleted file mode 100644 index bdd777561bc..00000000000 --- a/devices/gpu/nvidia/nvml/driver_linux.go +++ /dev/null @@ -1,85 +0,0 @@ -package nvml 
- -import ( - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return nvml.Init() -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return nvml.Shutdown() -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return nvml.GetDriverVersion() -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return nvml.GetDeviceCount() -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, err - } - deviceMode, err := device.GetDeviceMode() - if err != nil { - return nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: device.Clocks.Cores, - MemoryClockMHz: device.Clocks.Memory, - DisplayState: deviceMode.DisplayInfo.Mode.String(), - PersistenceMode: deviceMode.Persistence.String(), - }, nil -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, nil, err - } - status, err := device.Status() - if err != nil { - return nil, nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: device.Clocks.Cores, - 
MemoryClockMHz: device.Clocks.Memory, - }, &DeviceStatus{ - TemperatureC: status.Temperature, - GPUUtilization: status.Utilization.GPU, - MemoryUtilization: status.Utilization.Memory, - EncoderUtilization: status.Utilization.Encoder, - DecoderUtilization: status.Utilization.Decoder, - UsedMemoryMiB: status.Memory.Global.Used, - ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache, - ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache, - ECCErrorsDevice: status.Memory.ECCErrors.Device, - PowerUsageW: status.Power, - BAR1UsedMiB: status.PCI.BAR1Used, - }, nil -} diff --git a/devices/gpu/nvidia/nvml/shared.go b/devices/gpu/nvidia/nvml/shared.go deleted file mode 100644 index a0bb04d2223..00000000000 --- a/devices/gpu/nvidia/nvml/shared.go +++ /dev/null @@ -1,61 +0,0 @@ -package nvml - -import "errors" - -var ( - // UnavailableLib is returned when the nvml library could not be loaded. - UnavailableLib = errors.New("could not load NVML library") -) - -// nvmlDriver implements NvmlDriver -// Users are required to call Initialize method before using any other methods -type nvmlDriver struct{} - -// NvmlDriver represents set of methods to query nvml library -type NvmlDriver interface { - Initialize() error - Shutdown() error - SystemDriverVersion() (string, error) - DeviceCount() (uint, error) - DeviceInfoByIndex(uint) (*DeviceInfo, error) - DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) -} - -// DeviceInfo represents nvml device data -// this struct is returned by NvmlDriver DeviceInfoByIndex and -// DeviceInfoAndStatusByIndex methods -type DeviceInfo struct { - // The following fields are guaranteed to be retrieved from nvml - UUID string - PCIBusID string - DisplayState string - PersistenceMode string - - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - Name *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 - PCIBandwidthMBPerS *uint - CoresClockMHz *uint 
- MemoryClockMHz *uint -} - -// DeviceStatus represents nvml device status -// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method -type DeviceStatus struct { - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - PowerUsageW *uint - TemperatureC *uint - GPUUtilization *uint // % - MemoryUtilization *uint // % - EncoderUtilization *uint // % - DecoderUtilization *uint // % - BAR1UsedMiB *uint64 - UsedMemoryMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} diff --git a/devices/gpu/nvidia/stats.go b/devices/gpu/nvidia/stats.go deleted file mode 100644 index c6c44775791..00000000000 --- a/devices/gpu/nvidia/stats.go +++ /dev/null @@ -1,325 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names for reporting stats output - PowerUsageAttr = "Power usage" - PowerUsageUnit = "W" - PowerUsageDesc = "Power usage for this GPU in watts and " + - "its associated circuitry (e.g. memory) / Maximum GPU Power" - GPUUtilizationAttr = "GPU utilization" - GPUUtilizationUnit = "%" - GPUUtilizationDesc = "Percent of time over the past sample period " + - "during which one or more kernels were executing on the GPU." 
- MemoryUtilizationAttr = "Memory utilization" - MemoryUtilizationUnit = "%" - MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period" - EncoderUtilizationAttr = "Encoder utilization" - EncoderUtilizationUnit = "%" - EncoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Encoder was used" - DecoderUtilizationAttr = "Decoder utilization" - DecoderUtilizationUnit = "%" - DecoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Decoder was used" - TemperatureAttr = "Temperature" - TemperatureUnit = "C" // Celsius degrees - TemperatureDesc = "Temperature of the Unit" - MemoryStateAttr = "Memory state" - MemoryStateUnit = "MiB" // Mebibytes - MemoryStateDesc = "UsedMemory / TotalMemory" - BAR1StateAttr = "BAR1 buffer state" - BAR1StateUnit = "MiB" // Mebibytes - BAR1StateDesc = "UsedBAR1 / TotalBAR1" - ECCErrorsL1CacheAttr = "ECC L1 errors" - ECCErrorsL1CacheUnit = "#" // number of errors - ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device" - ECCErrorsL2CacheAttr = "ECC L2 errors" - ECCErrorsL2CacheUnit = "#" // number of errors - ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device" - ECCErrorsDeviceAttr = "ECC memory errors" - ECCErrorsDeviceUnit = "#" // number of errors - ECCErrorsDeviceDesc = "Requested memory error counter for the device" -) - -// stats is the long running goroutine that streams device statistics -func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) { - defer close(stats) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr) - stats <- device.NewStatsError(d.initErr) - } - - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return 
- case <-ticker.C: - ticker.Reset(interval) - } - - d.writeStatsToChannel(stats, time.Now()) - } -} - -// filterStatsByID accepts list of StatsData and set of IDs -// this function would return entries from StatsData with IDs found in the set -func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData { - var filteredStats []*nvml.StatsData - for _, statsItem := range stats { - if _, ok := ids[statsItem.UUID]; ok { - filteredStats = append(filteredStats, statsItem) - } - } - return filteredStats -} - -// writeStatsToChannel collects StatsData from NVML backend, groups StatsData -// by DeviceName attribute, populates DeviceGroupStats structure for every group -// and sends data over provided channel -func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) { - statsData, err := d.nvmlClient.GetStatsData() - if err != nil { - d.logger.Error("failed to get nvidia stats", "error", err) - stats <- &device.StatsResponse{ - Error: err, - } - return - } - - // filter only stats from devices that are stored in NvidiaDevice struct - d.deviceLock.RLock() - statsData = filterStatsByID(statsData, d.devices) - d.deviceLock.RUnlock() - - // group stats by DeviceName struct field - statsListByDeviceName := make(map[string][]*nvml.StatsData) - for _, statsItem := range statsData { - deviceName := statsItem.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. 
This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = ¬AvailableCopy - } - - statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem) - } - - // place data device.DeviceGroupStats struct for every group of stats - deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName)) - for groupName, groupStats := range statsListByDeviceName { - deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp)) - } - - stats <- &device.StatsResponse{ - Groups: deviceGroupsStats, - } -} - -func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue { - return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)} -} - -// statsForGroup is a helper function that populates device.DeviceGroupStats -// for given groupName with groupStats list -func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats { - instanceStats := make(map[string]*device.DeviceStats) - for _, statsItem := range groupStats { - instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp) - } - - return &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - InstanceStats: instanceStats, - } -} - -// statsForItem is a helper function that populates device.DeviceStats for given -// nvml.StatsData -func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats { - // nvml.StatsData holds pointers to values that can be nil - // In case they are nil return stats with 'notAvailable' constant - var ( - powerUsageStat *structs.StatValue - GPUUtilizationStat *structs.StatValue - memoryUtilizationStat *structs.StatValue - encoderUtilizationStat *structs.StatValue - decoderUtilizationStat *structs.StatValue - temperatureStat *structs.StatValue - memoryStateStat *structs.StatValue - BAR1StateStat 
*structs.StatValue - ECCErrorsL1CacheStat *structs.StatValue - ECCErrorsL2CacheStat *structs.StatValue - ECCErrorsDeviceStat *structs.StatValue - ) - - if statsItem.PowerUsageW == nil || statsItem.PowerW == nil { - powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc) - } else { - powerUsageStat = &structs.StatValue{ - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)), - IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW), - } - } - - if statsItem.GPUUtilization == nil { - GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc) - } else { - GPUUtilizationStat = &structs.StatValue{ - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization), - } - } - - if statsItem.MemoryUtilization == nil { - memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc) - } else { - memoryUtilizationStat = &structs.StatValue{ - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization), - } - } - - if statsItem.EncoderUtilization == nil { - encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc) - } else { - encoderUtilizationStat = &structs.StatValue{ - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization), - } - } - - if statsItem.DecoderUtilization == nil { - decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc) - } else { - decoderUtilizationStat = &structs.StatValue{ - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization), - } - } - - if statsItem.TemperatureC == nil { - temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc) - } else { - temperatureStat = 
&structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC), - } - } - - if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil { - memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc) - } else { - memoryStateStat = &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB), - } - } - - if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil { - BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc) - } else { - BAR1StateStat = &structs.StatValue{ - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB), - } - } - - if statsItem.ECCErrorsL1Cache == nil { - ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc) - } else { - ECCErrorsL1CacheStat = &structs.StatValue{ - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache), - } - } - - if statsItem.ECCErrorsL2Cache == nil { - ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc) - } else { - ECCErrorsL2CacheStat = &structs.StatValue{ - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache), - } - } - - if statsItem.ECCErrorsDevice == nil { - ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc) - } else { - ECCErrorsDeviceStat = &structs.StatValue{ - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice), - } - } - return &device.DeviceStats{ - Summary: memoryStateStat, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - 
PowerUsageAttr: powerUsageStat, - GPUUtilizationAttr: GPUUtilizationStat, - MemoryUtilizationAttr: memoryUtilizationStat, - EncoderUtilizationAttr: encoderUtilizationStat, - DecoderUtilizationAttr: decoderUtilizationStat, - TemperatureAttr: temperatureStat, - MemoryStateAttr: memoryStateStat, - BAR1StateAttr: BAR1StateStat, - ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat, - ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat, - ECCErrorsDeviceAttr: ECCErrorsDeviceStat, - }, - }, - Timestamp: timestamp, - } -} - -func uintToInt64Ptr(u *uint) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} - -func uint64ToInt64Ptr(u *uint64) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} diff --git a/devices/gpu/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go deleted file mode 100644 index f6221e0f480..00000000000 --- a/devices/gpu/nvidia/stats_test.go +++ /dev/null @@ -1,3041 +0,0 @@ -package nvidia - -import ( - "errors" - "sort" - "testing" - "time" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestFilterStatsByID(t *testing.T) { - for _, testCase := range []struct { - Name string - ProvidedStats []*nvml.StatsData - ProvidedIDs map[string]struct{} - ExpectedResult []*nvml.StatsData - }{ - { - Name: "All ids are in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), 
- TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Odd are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Even are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "No Stats were provided", - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - }, - { - Name: "No Ids were provided", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - } { - actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForItem(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - ItemStat *nvml.StatsData - ExpectedResult 
*device.DeviceStats - }{ - { - Name: "All fields in ItemStat are not nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, 
- IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Power usage is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: nil, - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - 
IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "PowerW is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: nil, - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, 
time.UTC), - }, - }, - { - Name: "GPUUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: nil, - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: nil, - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "EncoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: nil, - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: 
helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "DecoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: nil, - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, 
- MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Temperature is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: nil, - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "UsedMemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - 
DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: nil, - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: 
MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1UsedMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: nil, - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1MiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: nil, - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - 
ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - 
Name: "ECCErrorsL1Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: nil, - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsL2Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: nil, - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsDevice is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: nil, - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - }, - }, - Timestamp: time.Date(1974, 
time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - } { - actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForGroup(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - GroupStats []*nvml.StatsData - GroupName string - ExpectedResult *device.DeviceGroupStats - }{ - { - Name: "make sure that all data is transformed correctly", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - GroupName: "DeviceName1", - GroupStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: 
helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - ExpectedResult: &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - 
BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: 
helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: 
helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - } { - actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestWriteStatsToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - ExpectedWriteToChannel *device.StatsResponse - Timestamp time.Time - Device *NvidiaDevice - }{ - { - Name: "NVML wrapper returns error", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ExpectedWriteToChannel: &device.StatsResponse{ - Error: errors.New(""), - }, - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - StatsError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - 
ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName3", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups 2", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: 
helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - 
ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - 
Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: 
time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that only devices from 
NvidiaDevice.device map stats are reported", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - 
DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - } { - channel := make(chan *device.StatsResponse, 1) - testCase.Device.writeStatsToChannel(channel, testCase.Timestamp) - actualResult := <-channel - // writeStatsToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedWriteToChannel arrays has to be sorted firsted - sort.Slice(actualResult.Groups, func(i, j int) bool { - return actualResult.Groups[i].Name < actualResult.Groups[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name - }) - require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) - } -} diff --git a/go.mod b/go.mod index db72e0ab3e9..911b72a7663 100644 --- a/go.mod +++ b/go.mod @@ -23,7 +23,6 @@ require ( github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5 github.com/Microsoft/go-winio v0.4.15-0.20200113171025-3fe6c5262873 github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 // indirect - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 github.com/NYTimes/gziphandler v1.0.1 github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e github.com/armon/go-metrics v0.3.4 diff --git a/go.sum b/go.sum index d8f6cfda76a..6dfe3b5a41c 100644 --- a/go.sum +++ b/go.sum @@ -67,8 +67,6 @@ github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5/go.mod h1:nxQPcN github.com/Microsoft/hcsshim v0.8.7/go.mod 
h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ= github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 h1:2T9t72RkTRjAcuFc+4vaGWnRx/anVngE1/VGN/HFEVk= github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1/go.mod h1:LVvUcNYEzt59fFVTuiPEgM6dgF70yMGdy/Qc/UmCbuU= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA= github.com/NYTimes/gziphandler v1.0.0 h1:OswZCvpiFsNRCbeapdJxDuikAqVXTgV7XAht8S9olZo= github.com/NYTimes/gziphandler v1.0.0/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= diff --git a/helper/pluginutils/catalog/register_nvidia_linux.go b/helper/pluginutils/catalog/register_nvidia_linux.go deleted file mode 100644 index a50cbe833a7..00000000000 --- a/helper/pluginutils/catalog/register_nvidia_linux.go +++ /dev/null @@ -1,14 +0,0 @@ -// +build !nonvidia - -package catalog - -import ( - "github.com/hashicorp/nomad/devices/gpu/nvidia" -) - -// This file is where all builtin plugins should be registered in the catalog. -// Plugins with build restrictions should be placed in the appropriate -// register_XXX.go file. -func init() { - Register(nvidia.PluginID, nvidia.PluginConfig) -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE deleted file mode 100644 index 2a718d63da7..00000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2018, NVIDIA Corporation -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go deleted file mode 100644 index 4bba898342f..00000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go +++ /dev/null @@ -1,634 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -package nvml - -// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files -// #include "nvml_dl.h" -import "C" - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "sort" - "strconv" - "strings" -) - -const ( - szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE - szName = C.NVML_DEVICE_NAME_BUFFER_SIZE - szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE - szProcs = 32 - szProcName = 64 - - XidCriticalError = C.nvmlEventTypeXidCriticalError -) - -type handle struct{ dev C.nvmlDevice_t } -type EventSet struct{ set C.nvmlEventSet_t } -type Event struct { - UUID *string - Etype uint64 - Edata uint64 -} - -func uintPtr(c C.uint) *uint { - i := uint(c) - return &i -} - -func uint64Ptr(c C.ulonglong) *uint64 { - i := uint64(c) - return &i -} - -func stringPtr(c *C.char) *string { - s := C.GoString(c) - return &s -} - -func errorString(ret C.nvmlReturn_t) error { - if ret == C.NVML_SUCCESS { - return nil - } - err := C.GoString(C.nvmlErrorString(ret)) - return fmt.Errorf("nvml: %v", err) -} - -func init_() error { - r := C.nvmlInit_dl() - if r == C.NVML_ERROR_LIBRARY_NOT_FOUND { - return errors.New("could not load NVML library") - } - return errorString(r) -} - -func NewEventSet() EventSet { - var set C.nvmlEventSet_t - C.nvmlEventSetCreate(&set) - - return EventSet{set} -} - -func RegisterEvent(es EventSet, event int) error { - n, err := deviceGetCount() - if err != nil { - return err - } - - var i uint - for i = 0; i < n; i++ { - h, err := deviceGetHandleByIndex(i) - if err != nil { - return err - } - - r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) - if r != C.NVML_SUCCESS { - return errorString(r) - } - } - - return nil -} - -func RegisterEventForDevice(es EventSet, event int, uuid string) error { - n, err := deviceGetCount() - if err != nil { - return err - } - - var i uint - for i = 0; i < n; i++ { - h, err := deviceGetHandleByIndex(i) - if err != nil { - return err - } - - duuid, err := h.deviceGetUUID() - if err != nil { - return err - } 
- - if *duuid != uuid { - continue - } - - r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) - if r != C.NVML_SUCCESS { - return errorString(r) - } - - return nil - } - - return fmt.Errorf("nvml: device not found") -} - -func DeleteEventSet(es EventSet) { - C.nvmlEventSetFree(es.set) -} - -func WaitForEvent(es EventSet, timeout uint) (Event, error) { - var data C.nvmlEventData_t - - r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout)) - uuid, _ := handle{data.device}.deviceGetUUID() - - return Event{ - UUID: uuid, - Etype: uint64(data.eventType), - Edata: uint64(data.eventData), - }, - errorString(r) -} - -func shutdown() error { - return errorString(C.nvmlShutdown_dl()) -} - -func systemGetDriverVersion() (string, error) { - var driver [szDriver]C.char - - r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver) - return C.GoString(&driver[0]), errorString(r) -} - -func systemGetProcessName(pid uint) (string, error) { - var proc [szProcName]C.char - - r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName) - return C.GoString(&proc[0]), errorString(r) -} - -func deviceGetCount() (uint, error) { - var n C.uint - - r := C.nvmlDeviceGetCount(&n) - return uint(n), errorString(r) -} - -func deviceGetHandleByIndex(idx uint) (handle, error) { - var dev C.nvmlDevice_t - - r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev) - return handle{dev}, errorString(r) -} - -func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) { - var level C.nvmlGpuTopologyLevel_t - - r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level) - if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(C.uint(level)), errorString(r) -} - -func (h handle) deviceGetName() (*string, error) { - var name [szName]C.char - - r := C.nvmlDeviceGetName(h.dev, &name[0], szName) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&name[0]), errorString(r) -} - -func (h 
handle) deviceGetUUID() (*string, error) { - var uuid [szUUID]C.char - - r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&uuid[0]), errorString(r) -} - -func (h handle) deviceGetPciInfo() (*string, error) { - var pci C.nvmlPciInfo_t - - r := C.nvmlDeviceGetPciInfo(h.dev, &pci) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&pci.busId[0]), errorString(r) -} - -func (h handle) deviceGetMinorNumber() (*uint, error) { - var minor C.uint - - r := C.nvmlDeviceGetMinorNumber(h.dev, &minor) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(minor), errorString(r) -} - -func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) { - var bar1 C.nvmlBAR1Memory_t - - r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r) -} - -func (h handle) deviceGetPowerManagementLimit() (*uint, error) { - var power C.uint - - r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(power), errorString(r) -} - -func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) { - var sm, mem C.uint - - r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) - } - return uintPtr(sm), uintPtr(mem), errorString(r) -} - -func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) { - var link C.uint - - r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(link), errorString(r) -} - -func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) { - var width C.uint - - r := 
C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(width), errorString(r) -} - -func (h handle) deviceGetPowerUsage() (*uint, error) { - var power C.uint - - r := C.nvmlDeviceGetPowerUsage(h.dev, &power) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(power), errorString(r) -} - -func (h handle) deviceGetTemperature() (*uint, error) { - var temp C.uint - - r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(temp), errorString(r) -} - -func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) { - var usage C.nvmlUtilization_t - - r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r) -} - -func (h handle) deviceGetEncoderUtilization() (*uint, error) { - var usage, sampling C.uint - - r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(usage), errorString(r) -} - -func (h handle) deviceGetDecoderUtilization() (*uint, error) { - var usage, sampling C.uint - - r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(usage), errorString(r) -} - -func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) { - var mem C.nvmlMemory_t - - r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - err = errorString(r) - if r != C.NVML_SUCCESS { - return - } - - totalMem = uint64Ptr(mem.total) - if totalMem != nil { - *totalMem /= 1024 * 1024 // MiB - } - - devMem = DeviceMemory{ - Used: uint64Ptr(mem.used), - Free: uint64Ptr(mem.free), - } - - if devMem.Used != nil { - *devMem.Used /= 1024 * 1024 // 
MiB - } - - if devMem.Free != nil { - *devMem.Free /= 1024 * 1024 // MiB - } - return -} - -func (h handle) deviceGetClockInfo() (*uint, *uint, error) { - var sm, mem C.uint - - r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) - } - return uintPtr(sm), uintPtr(mem), errorString(r) -} - -func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) { - var l1, l2, mem C.ulonglong - - r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2) - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem) - } - return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r) -} - -func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) { - var rx, tx C.uint - - r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx) - } - return uintPtr(rx), uintPtr(tx), errorString(r) -} - -func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) { - var procs [szProcs]C.nvmlProcessInfo_t - var count = C.uint(szProcs) - - r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0]) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - n := int(count) - pids := make([]uint, n) - mems := make([]uint64, n) - for i := 0; i < n; 
i++ { - pids[i] = uint(procs[i].pid) - mems[i] = uint64(procs[i].usedGpuMemory) - } - return pids, mems, errorString(r) -} - -func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) { - var procs [szProcs]C.nvmlProcessInfo_t - var count = C.uint(szProcs) - - r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0]) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - n := int(count) - pids := make([]uint, n) - mems := make([]uint64, n) - for i := 0; i < n; i++ { - pids[i] = uint(procs[i].pid) - mems[i] = uint64(procs[i].usedGpuMemory) - } - return pids, mems, errorString(r) -} - -func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) { - cPids, cpMems, err := h.deviceGetComputeRunningProcesses() - if err != nil { - return nil, err - } - - gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses() - if err != nil { - return nil, err - } - - allPids := make(map[uint]ProcessInfo) - - for i, pid := range cPids { - name, err := processName(pid) - if err != nil { - return nil, err - } - allPids[pid] = ProcessInfo{ - PID: pid, - Name: name, - MemoryUsed: cpMems[i] / (1024 * 1024), // MiB - Type: Compute, - } - - } - - for i, pid := range gPids { - pInfo, exists := allPids[pid] - if exists { - pInfo.Type = ComputeAndGraphics - allPids[pid] = pInfo - } else { - name, err := processName(pid) - if err != nil { - return nil, err - } - allPids[pid] = ProcessInfo{ - PID: pid, - Name: name, - MemoryUsed: gpMems[i] / (1024 * 1024), // MiB - Type: Graphics, - } - } - } - - var processInfo []ProcessInfo - for _, v := range allPids { - processInfo = append(processInfo, v) - } - sort.Slice(processInfo, func(i, j int) bool { - return processInfo[i].PID < processInfo[j].PID - }) - - return processInfo, nil -} - -func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) { - var clocksThrottleReasons C.ulonglong - - r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons) - - if r 
== C.NVML_ERROR_NOT_SUPPORTED { - return ThrottleReasonUnknown, nil - } - - if r != C.NVML_SUCCESS { - return ThrottleReasonUnknown, errorString(r) - } - - switch clocksThrottleReasons { - case C.nvmlClocksThrottleReasonGpuIdle: - reason = ThrottleReasonGpuIdle - case C.nvmlClocksThrottleReasonApplicationsClocksSetting: - reason = ThrottleReasonApplicationsClocksSetting - case C.nvmlClocksThrottleReasonSwPowerCap: - reason = ThrottleReasonSwPowerCap - case C.nvmlClocksThrottleReasonHwSlowdown: - reason = ThrottleReasonHwSlowdown - case C.nvmlClocksThrottleReasonSyncBoost: - reason = ThrottleReasonSyncBoost - case C.nvmlClocksThrottleReasonSwThermalSlowdown: - reason = ThrottleReasonSwThermalSlowdown - case C.nvmlClocksThrottleReasonHwThermalSlowdown: - reason = ThrottleReasonHwThermalSlowdown - case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown: - reason = ThrottleReasonHwPowerBrakeSlowdown - case C.nvmlClocksThrottleReasonDisplayClockSetting: - reason = ThrottleReasonDisplayClockSetting - case C.nvmlClocksThrottleReasonNone: - reason = ThrottleReasonNone - } - return -} - -func (h handle) getPerformanceState() (PerfState, error) { - var pstate C.nvmlPstates_t - - r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate) - - if r == C.NVML_ERROR_NOT_SUPPORTED { - return PerfStateUnknown, nil - } - - if r != C.NVML_SUCCESS { - return PerfStateUnknown, errorString(r) - } - return PerfState(pstate), nil -} - -func processName(pid uint) (string, error) { - f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm` - d, err := ioutil.ReadFile(f) - - if err != nil { - // TOCTOU: process terminated - if os.IsNotExist(err) { - return "", nil - } - return "", err - } - return strings.TrimSuffix(string(d), "\n"), err -} - -func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) { - var mode C.nvmlEnableState_t - var buffer C.uint - - r := C.nvmlDeviceGetAccountingMode(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != 
C.NVML_SUCCESS { - return accountingInfo, errorString(r) - } - - r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return accountingInfo, errorString(r) - } - - accountingInfo = Accounting{ - Mode: ModeState(mode), - BufferSize: uintPtr(buffer), - } - return -} - -func (h handle) getDisplayInfo() (display Display, err error) { - var mode, isActive C.nvmlEnableState_t - - r := C.nvmlDeviceGetDisplayActive(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return display, errorString(r) - } - - r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - if r != C.NVML_SUCCESS { - return display, errorString(r) - } - display = Display{ - Mode: ModeState(mode), - Active: ModeState(isActive), - } - return -} - -func (h handle) getPeristenceMode() (state ModeState, err error) { - var mode C.nvmlEnableState_t - - r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - return ModeState(mode), errorString(r) -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go deleted file mode 100644 index f6ec9e8fae3..00000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go +++ /dev/null @@ -1,533 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -package nvml - -// #include "nvml_dl.h" -import "C" - -import ( - "bytes" - "errors" - "fmt" - "io/ioutil" - "strconv" - "strings" -) - -var ( - ErrCPUAffinity = errors.New("failed to retrieve CPU affinity") - ErrUnsupportedP2PLink = errors.New("unsupported P2P link type") - ErrUnsupportedGPU = errors.New("unsupported GPU device") -) - -type ModeState uint - -const ( - Enabled ModeState = iota - Disabled -) - -func (m ModeState) String() string { - switch m { - case Enabled: - return "Enabled" - case Disabled: - return "Disabled" - } - return "N/A" -} - -type Display struct { - Mode ModeState - Active ModeState -} - -type Accounting struct { - Mode ModeState - BufferSize *uint -} - -type DeviceMode struct { - DisplayInfo Display - Persistence ModeState - AccountingInfo Accounting -} - -type ThrottleReason uint - -const ( - ThrottleReasonGpuIdle ThrottleReason = iota - ThrottleReasonApplicationsClocksSetting - ThrottleReasonSwPowerCap - ThrottleReasonHwSlowdown - ThrottleReasonSyncBoost - ThrottleReasonSwThermalSlowdown - ThrottleReasonHwThermalSlowdown - ThrottleReasonHwPowerBrakeSlowdown - ThrottleReasonDisplayClockSetting - ThrottleReasonNone - ThrottleReasonUnknown -) - -func (r ThrottleReason) String() string { - switch r { - case ThrottleReasonGpuIdle: - return "Gpu Idle" - case ThrottleReasonApplicationsClocksSetting: - return "Applications Clocks Setting" - case ThrottleReasonSwPowerCap: - return "SW Power Cap" - case ThrottleReasonHwSlowdown: - return "HW Slowdown" - case ThrottleReasonSyncBoost: - return "Sync Boost" - case ThrottleReasonSwThermalSlowdown: - return "SW Thermal Slowdown" - case ThrottleReasonHwThermalSlowdown: - return "HW Thermal Slowdown" - case ThrottleReasonHwPowerBrakeSlowdown: - return "HW Power Brake Slowdown" - case ThrottleReasonDisplayClockSetting: - return "Display Clock Setting" - case ThrottleReasonNone: - return "No clocks throttling" - } - return "N/A" -} - -type PerfState uint - -const ( - PerfStateMax = 0 - PerfStateMin 
= 15 - PerfStateUnknown = 32 -) - -func (p PerfState) String() string { - if p >= PerfStateMax && p <= PerfStateMin { - return fmt.Sprintf("P%d", p) - } - return "Unknown" -} - -type ProcessType uint - -const ( - Compute ProcessType = iota - Graphics - ComputeAndGraphics -) - -func (t ProcessType) String() string { - typ := "C+G" - if t == Compute { - typ = "C" - } else if t == Graphics { - typ = "G" - } - return typ -} - -type P2PLinkType uint - -const ( - P2PLinkUnknown P2PLinkType = iota - P2PLinkCrossCPU - P2PLinkSameCPU - P2PLinkHostBridge - P2PLinkMultiSwitch - P2PLinkSingleSwitch - P2PLinkSameBoard -) - -type P2PLink struct { - BusID string - Link P2PLinkType -} - -func (t P2PLinkType) String() string { - switch t { - case P2PLinkCrossCPU: - return "Cross CPU socket" - case P2PLinkSameCPU: - return "Same CPU socket" - case P2PLinkHostBridge: - return "Host PCI bridge" - case P2PLinkMultiSwitch: - return "Multiple PCI switches" - case P2PLinkSingleSwitch: - return "Single PCI switch" - case P2PLinkSameBoard: - return "Same board" - case P2PLinkUnknown: - } - return "N/A" -} - -type ClockInfo struct { - Cores *uint - Memory *uint -} - -type PCIInfo struct { - BusID string - BAR1 *uint64 - Bandwidth *uint -} - -type Device struct { - handle - - UUID string - Path string - Model *string - Power *uint - Memory *uint64 - CPUAffinity *uint - PCI PCIInfo - Clocks ClockInfo - Topology []P2PLink -} - -type UtilizationInfo struct { - GPU *uint - Memory *uint - Encoder *uint - Decoder *uint -} - -type PCIThroughputInfo struct { - RX *uint - TX *uint -} - -type PCIStatusInfo struct { - BAR1Used *uint64 - Throughput PCIThroughputInfo -} - -type ECCErrorsInfo struct { - L1Cache *uint64 - L2Cache *uint64 - Device *uint64 -} - -type DeviceMemory struct { - Used *uint64 - Free *uint64 -} - -type MemoryInfo struct { - Global DeviceMemory - ECCErrors ECCErrorsInfo -} - -type ProcessInfo struct { - PID uint - Name string - MemoryUsed uint64 - Type ProcessType -} - -type 
DeviceStatus struct { - Power *uint - Temperature *uint - Utilization UtilizationInfo - Memory MemoryInfo - Clocks ClockInfo - PCI PCIStatusInfo - Processes []ProcessInfo - Throttle ThrottleReason - Performance PerfState -} - -func assert(err error) { - if err != nil { - panic(err) - } -} - -func Init() error { - return init_() -} - -func Shutdown() error { - return shutdown() -} - -func GetDeviceCount() (uint, error) { - return deviceGetCount() -} - -func GetDriverVersion() (string, error) { - return systemGetDriverVersion() -} - -func numaNode(busid string) (uint, error) { - // discard leading zeros of busid - b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:]))) - if err != nil { - // XXX report node 0 if NUMA support isn't enabled - return 0, nil - } - node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8) - if err != nil { - return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err) - } - if node < 0 { - node = 0 // XXX report node 0 instead of NUMA_NO_NODE - } - return uint(node), nil -} - -func pciBandwidth(gen, width *uint) *uint { - m := map[uint]uint{ - 1: 250, // MB/s - 2: 500, - 3: 985, - 4: 1969, - } - if gen == nil || width == nil { - return nil - } - bw := m[*gen] * *width - return &bw -} - -func NewDevice(idx uint) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByIndex(idx) - assert(err) - model, err := h.deviceGetName() - assert(err) - uuid, err := h.deviceGetUUID() - assert(err) - minor, err := h.deviceGetMinorNumber() - assert(err) - power, err := h.deviceGetPowerManagementLimit() - assert(err) - totalMem, _, err := h.deviceGetMemoryInfo() - assert(err) - busid, err := h.deviceGetPciInfo() - assert(err) - bar1, _, err := h.deviceGetBAR1MemoryInfo() - assert(err) - pcig, err := h.deviceGetMaxPcieLinkGeneration() - assert(err) - pciw, err := h.deviceGetMaxPcieLinkWidth() - assert(err) - ccore, cmem, err := 
h.deviceGetMaxClockInfo() - assert(err) - - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - path := fmt.Sprintf("/dev/nvidia%d", *minor) - node, err := numaNode(*busid) - assert(err) - - device = &Device{ - handle: h, - UUID: *uuid, - Path: path, - Model: model, - Power: power, - Memory: totalMem, - CPUAffinity: &node, - PCI: PCIInfo{ - BusID: *busid, - BAR1: bar1, - Bandwidth: pciBandwidth(pcig, pciw), // MB/s - }, - Clocks: ClockInfo{ - Cores: ccore, // MHz - Memory: cmem, // MHz - }, - } - if power != nil { - *device.Power /= 1000 // W - } - if bar1 != nil { - *device.PCI.BAR1 /= 1024 * 1024 // MiB - } - return -} - -func NewDeviceLite(idx uint) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByIndex(idx) - assert(err) - uuid, err := h.deviceGetUUID() - assert(err) - minor, err := h.deviceGetMinorNumber() - assert(err) - busid, err := h.deviceGetPciInfo() - assert(err) - - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - path := fmt.Sprintf("/dev/nvidia%d", *minor) - - device = &Device{ - handle: h, - UUID: *uuid, - Path: path, - PCI: PCIInfo{ - BusID: *busid, - }, - } - return -} - -func (d *Device) Status() (status *DeviceStatus, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - power, err := d.deviceGetPowerUsage() - assert(err) - temp, err := d.deviceGetTemperature() - assert(err) - ugpu, umem, err := d.deviceGetUtilizationRates() - assert(err) - uenc, err := d.deviceGetEncoderUtilization() - assert(err) - udec, err := d.deviceGetDecoderUtilization() - assert(err) - _, devMem, err := d.deviceGetMemoryInfo() - assert(err) - ccore, cmem, err := d.deviceGetClockInfo() - assert(err) - _, bar1, err := d.deviceGetBAR1MemoryInfo() - assert(err) - el1, el2, emem, err := d.deviceGetMemoryErrorCounter() - assert(err) - pcirx, pcitx, err := 
d.deviceGetPcieThroughput() - assert(err) - throttle, err := d.getClocksThrottleReasons() - assert(err) - perfState, err := d.getPerformanceState() - assert(err) - processInfo, err := d.deviceGetAllRunningProcesses() - assert(err) - - status = &DeviceStatus{ - Power: power, - Temperature: temp, // °C - Utilization: UtilizationInfo{ - GPU: ugpu, // % - Memory: umem, // % - Encoder: uenc, // % - Decoder: udec, // % - }, - Memory: MemoryInfo{ - Global: devMem, - ECCErrors: ECCErrorsInfo{ - L1Cache: el1, - L2Cache: el2, - Device: emem, - }, - }, - Clocks: ClockInfo{ - Cores: ccore, // MHz - Memory: cmem, // MHz - }, - PCI: PCIStatusInfo{ - BAR1Used: bar1, - Throughput: PCIThroughputInfo{ - RX: pcirx, - TX: pcitx, - }, - }, - Throttle: throttle, - Performance: perfState, - Processes: processInfo, - } - if power != nil { - *status.Power /= 1000 // W - } - if bar1 != nil { - *status.PCI.BAR1Used /= 1024 * 1024 // MiB - } - if pcirx != nil { - *status.PCI.Throughput.RX /= 1000 // MB/s - } - if pcitx != nil { - *status.PCI.Throughput.TX /= 1000 // MB/s - } - return -} - -func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) { - level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle) - if err != nil || level == nil { - return P2PLinkUnknown, err - } - - switch *level { - case C.NVML_TOPOLOGY_INTERNAL: - link = P2PLinkSameBoard - case C.NVML_TOPOLOGY_SINGLE: - link = P2PLinkSingleSwitch - case C.NVML_TOPOLOGY_MULTIPLE: - link = P2PLinkMultiSwitch - case C.NVML_TOPOLOGY_HOSTBRIDGE: - link = P2PLinkHostBridge - case C.NVML_TOPOLOGY_CPU: - link = P2PLinkSameCPU - case C.NVML_TOPOLOGY_SYSTEM: - link = P2PLinkCrossCPU - default: - err = ErrUnsupportedP2PLink - } - return -} - -func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) { - return d.handle.deviceGetComputeRunningProcesses() -} - -func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) { - return d.handle.deviceGetGraphicsRunningProcesses() -} - -func (d 
*Device) GetAllRunningProcesses() ([]ProcessInfo, error) { - return d.handle.deviceGetAllRunningProcesses() -} - -func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - display, err := d.getDisplayInfo() - assert(err) - - p, err := d.getPeristenceMode() - assert(err) - - accounting, err := d.getAccountingInfo() - assert(err) - - mode = &DeviceMode{ - DisplayInfo: display, - Persistence: p, - AccountingInfo: accounting, - } - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h deleted file mode 100644 index 60185dac239..00000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h +++ /dev/null @@ -1,5871 +0,0 @@ -/* - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO USER: - * - * This source code is subject to NVIDIA ownership rights under U.S. and - * international Copyright laws. Users and possessors of this source code - * are hereby granted a nonexclusive, royalty-free license to use this code - * in individual and commercial software. - * - * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE - * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR - * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH - * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, - * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE - * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE - * OR PERFORMANCE OF THIS SOURCE CODE. - * - * U.S. Government End Users. 
This source code is a "commercial item" as - * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - * "commercial computer software" and "commercial computer software - * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) - * and is provided to the U.S. Government only as a commercial end item. - * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through - * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the - * source code with only those rights set forth herein. - * - * Any use of this source code in individual and commercial software must - * include, in the user documentation and internal comments to the code, - * the above Disclaimer and U.S. Government End Users Notice. - */ - -/* -NVML API Reference - -The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and -managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building -3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi -tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. - -API Documentation - -Supported platforms: -- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit -- Linux: 32-bit and 64-bit -- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 - -Supported products: -- Full Support - - All Tesla products, starting with the Fermi architecture - - All Quadro products, starting with the Fermi architecture - - All GRID products, starting with the Kepler architecture - - Selected GeForce Titan products -- Limited Support - - All Geforce products, starting with the Fermi architecture - -The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is -not be added to the system path by default. 
To dynamically link to NVML, add this path to the PATH -environmental variable. To dynamically load NVML, call LoadLibrary with this path. - -On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit -and 64 bit NVML libraries will be installed. - -Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html -*/ - -#ifndef __nvml_nvml_h__ -#define __nvml_nvml_h__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * On Windows, set up methods for DLL export - * define NVML_STATIC_IMPORT when using nvml_loader library - */ -#if defined _WINDOWS - #if !defined NVML_STATIC_IMPORT - #if defined NVML_LIB_EXPORT - #define DECLDIR __declspec(dllexport) - #else - #define DECLDIR __declspec(dllimport) - #endif - #else - #define DECLDIR - #endif -#else - #define DECLDIR -#endif - -/** - * NVML API versioning support - */ -#define NVML_API_VERSION 9 -#define NVML_API_VERSION_STR "9" -#define nvmlInit nvmlInit_v2 -#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 -#define nvmlDeviceGetCount nvmlDeviceGetCount_v2 -#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 -#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 -#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 -#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceStructs Device Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Special constant that some fields take when they are not available. - * Used when only part of the struct is not available. - * - * Each structure explicitly states when to check for this value. 
- */ -#define NVML_VALUE_NOT_AVAILABLE (-1) - -typedef struct nvmlDevice_st* nvmlDevice_t; - -/** - * Buffer size guaranteed to be large enough for pci bus id - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 - -/** - * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 - -/** - * PCI information about a GPU device. - */ -typedef struct nvmlPciInfo_st -{ - char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) - unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff - unsigned int bus; //!< The bus on which the device resides, 0 to 0xff - unsigned int device; //!< The device's id on the bus, 0 to 31 - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - - // Added in NVML 2.285 API - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) -} nvmlPciInfo_t; - -/** - * Detailed ECC error counts for a device. - * - * @deprecated Different GPU families can have different memory error counters - * See \ref nvmlDeviceGetMemoryErrorCounter - */ -typedef struct nvmlEccErrorCounts_st -{ - unsigned long long l1Cache; //!< L1 cache errors - unsigned long long l2Cache; //!< L2 cache errors - unsigned long long deviceMemory; //!< Device memory errors - unsigned long long registerFile; //!< Register file errors -} nvmlEccErrorCounts_t; - -/** - * Utilization information for a device. - * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
- */ -typedef struct nvmlUtilization_st -{ - unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU - unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written -} nvmlUtilization_t; - -/** - * Memory allocation information for a device. - */ -typedef struct nvmlMemory_st -{ - unsigned long long total; //!< Total installed FB memory (in bytes) - unsigned long long free; //!< Unallocated FB memory (in bytes) - unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping -} nvmlMemory_t; - -/** - * BAR1 Memory allocation Information for a device - */ -typedef struct nvmlBAR1Memory_st -{ - unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) - unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) - unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) -}nvmlBAR1Memory_t; - -/** - * Information about running compute processes on the GPU - */ -typedef struct nvmlProcessInfo_st -{ - unsigned int pid; //!< Process ID - unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. - //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported - //! 
because Windows KMD manages all the memory and not the NVIDIA driver -} nvmlProcessInfo_t; - -/** - * Enum to represent type of bridge chip - */ -typedef enum nvmlBridgeChipType_enum -{ - NVML_BRIDGE_CHIP_PLX = 0, - NVML_BRIDGE_CHIP_BRO4 = 1 -}nvmlBridgeChipType_t; - -/** - * Maximum number of NvLink links supported - */ -#define NVML_NVLINK_MAX_LINKS 6 - -/** - * Enum to represent the NvLink utilization counter packet units - */ -typedef enum nvmlNvLinkUtilizationCountUnits_enum -{ - NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles - NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets - NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes - - // this must be last - NVML_NVLINK_COUNTER_UNIT_COUNT -} nvmlNvLinkUtilizationCountUnits_t; - -/** - * Enum to represent the NvLink utilization counter packet types to count - * ** this is ONLY applicable with the units as packets or bytes - * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t - * ** all packet filter descriptions are target GPU centric - * ** these can be "OR'd" together - */ -typedef enum nvmlNvLinkUtilizationCountPktTypes_enum -{ - NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets - NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets - NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets - NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests - NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data - NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data - NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets -} nvmlNvLinkUtilizationCountPktTypes_t; - -/** - * Struct to define the NVLINK counter controls - */ -typedef struct nvmlNvLinkUtilizationControl_st -{ - nvmlNvLinkUtilizationCountUnits_t units; - nvmlNvLinkUtilizationCountPktTypes_t pktfilter; -} 
nvmlNvLinkUtilizationControl_t; - -/** - * Enum to represent NvLink queryable capabilities - */ -typedef enum nvmlNvLinkCapability_enum -{ - NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported - NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported - NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported - NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported - NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link - NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device - // should be last - NVML_NVLINK_CAP_COUNT -} nvmlNvLinkCapability_t; - -/** - * Enum to represent NvLink queryable error counters - */ -typedef enum nvmlNvLinkErrorCounter_enum -{ - NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter - NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter - NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter - NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter - - // this must be last - NVML_NVLINK_ERROR_COUNT -} nvmlNvLinkErrorCounter_t; - -/** - * Represents level relationships within a system between two GPUs - * The enums are spaced to allow for future relationships - */ -typedef enum nvmlGpuLevel_enum -{ - NVML_TOPOLOGY_INTERNAL = 0, // e.g. 
Tesla K80 - NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch - NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge - NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge - NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges - NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system - - // there is purposefully no COUNT here because of the need for spacing above -} nvmlGpuTopologyLevel_t; - -/* Compatibility for CPU->NODE renaming */ -#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE - -/* P2P Capability Index Status*/ -typedef enum nvmlGpuP2PStatus_enum -{ - NVML_P2P_STATUS_OK = 0, - NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, - NVML_P2P_STATUS_GPU_NOT_SUPPORTED, - NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, - NVML_P2P_STATUS_DISABLED_BY_REGKEY, - NVML_P2P_STATUS_NOT_SUPPORTED, - NVML_P2P_STATUS_UNKNOWN - -} nvmlGpuP2PStatus_t; - -/* P2P Capability Index*/ -typedef enum nvmlGpuP2PCapsIndex_enum -{ - NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN -}nvmlGpuP2PCapsIndex_t; - -/** - * Maximum limit on Physical Bridges per Board - */ -#define NVML_MAX_PHYSICAL_BRIDGE (128) - -/** - * Information about the Bridge Chip Firmware - */ -typedef struct nvmlBridgeChipInfo_st -{ - nvmlBridgeChipType_t type; //!< Type of Bridge Chip - unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable -}nvmlBridgeChipInfo_t; - -/** - * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate - * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
- */ -typedef struct nvmlBridgeChipHierarchy_st -{ - unsigned char bridgeCount; //!< Number of Bridge Chips on the Board - nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board -}nvmlBridgeChipHierarchy_t; - -/** - * Represents Type of Sampling Event - */ -typedef enum nvmlSamplingType_enum -{ - NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU - NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU - NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written - NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy - NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy - NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples - NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples - - // Keep this last - NVML_SAMPLINGTYPE_COUNT -}nvmlSamplingType_t; - -/** - * Represents the queryable PCIe utilization counters - */ -typedef enum nvmlPcieUtilCounter_enum -{ - NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity - NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity - - // Keep this last - NVML_PCIE_UTIL_COUNT -} nvmlPcieUtilCounter_t; - -/** - * Represents the type for sample value returned - */ -typedef enum nvmlValueType_enum -{ - NVML_VALUE_TYPE_DOUBLE = 0, - NVML_VALUE_TYPE_UNSIGNED_INT = 1, - NVML_VALUE_TYPE_UNSIGNED_LONG = 2, - NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, - NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, - - // Keep this last - NVML_VALUE_TYPE_COUNT -}nvmlValueType_t; - - -/** - * Union to represent different types of Value - */ -typedef union nvmlValue_st -{ - double dVal; //!< If the value is double - unsigned int uiVal; //!< If the value is unsigned int - unsigned long ulVal; //!< If the value is unsigned long - unsigned long 
long ullVal; //!< If the value is unsigned long long - signed long long sllVal; //!< If the value is signed long long -}nvmlValue_t; - -/** - * Information for Sample - */ -typedef struct nvmlSample_st -{ - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t sampleValue; //!< Sample Value -}nvmlSample_t; - -/** - * Represents type of perf policy for which violation times can be queried - */ -typedef enum nvmlPerfPolicyType_enum -{ - NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks - NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks - NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks - NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks - NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks - NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks - - NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) - NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks - - // Keep this last - NVML_PERF_POLICY_COUNT -}nvmlPerfPolicyType_t; - -/** - * Struct to hold perf policy violation status data - */ -typedef struct nvmlViolationTime_st -{ - unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds - unsigned long long violationTime; //!< violationTime in Nanoseconds -}nvmlViolationTime_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceEnumvs Device Enums - * @{ - */ 
-/***************************************************************************************************/ - -/** - * Generic enable/disable enum. - */ -typedef enum nvmlEnableState_enum -{ - NVML_FEATURE_DISABLED = 0, //!< Feature disabled - NVML_FEATURE_ENABLED = 1 //!< Feature enabled -} nvmlEnableState_t; - -//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. -#define nvmlFlagDefault 0x00 -//! Generic flag used to force some behavior. See description of particular functions for details. -#define nvmlFlagForce 0x01 - -/** - * * The Brand of the GPU - * */ -typedef enum nvmlBrandType_enum -{ - NVML_BRAND_UNKNOWN = 0, - NVML_BRAND_QUADRO = 1, - NVML_BRAND_TESLA = 2, - NVML_BRAND_NVS = 3, - NVML_BRAND_GRID = 4, - NVML_BRAND_GEFORCE = 5, - NVML_BRAND_TITAN = 6, - - // Keep this last - NVML_BRAND_COUNT -} nvmlBrandType_t; - -/** - * Temperature thresholds. - */ -typedef enum nvmlTemperatureThresholds_enum -{ - NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down - // for HW protection - NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown - NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown - NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock - // Keep this last - NVML_TEMPERATURE_THRESHOLD_COUNT -} nvmlTemperatureThresholds_t; - -/** - * Temperature sensors. - */ -typedef enum nvmlTemperatureSensors_enum -{ - NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die - - // Keep this last - NVML_TEMPERATURE_COUNT -} nvmlTemperatureSensors_t; - -/** - * Compute mode. - * - * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. - * Earlier CUDA versions supported a single exclusive mode, - * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
- */ -typedef enum nvmlComputeMode_enum -{ - NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed - NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - - // Keep this last - NVML_COMPUTEMODE_COUNT -} nvmlComputeMode_t; - -/** - * ECC bit types. - * - * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type - */ -#define nvmlEccBitType_t nvmlMemoryErrorType_t - -/** - * Single bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED - */ -#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED - -/** - * Double bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED - */ -#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED - -/** - * Memory error types - */ -typedef enum nvmlMemoryErrorType_enum -{ - /** - * A memory error that was corrected - * - * For ECC errors, these are single bit errors - * For Texture memory, these are errors fixed by resend - */ - NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, - /** - * A memory error that was not corrected - * - * For ECC errors, these are double bit errors - * For Texture memory, these are errors where the resend fails - */ - NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, - - - // Keep this last - NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types - -} nvmlMemoryErrorType_t; - -/** - * ECC counter types. - * - * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. - * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver - * client active (e.g. X11), then Linux also sees per-boot behavior. 
If not, volatile counts are reset each time a compute app - * is run. - */ -typedef enum nvmlEccCounterType_enum -{ - NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. - NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) - - // Keep this last - NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types -} nvmlEccCounterType_t; - -/** - * Clock types. - * - * All speeds are in Mhz. - */ -typedef enum nvmlClockType_enum -{ - NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain - NVML_CLOCK_SM = 1, //!< SM clock domain - NVML_CLOCK_MEM = 2, //!< Memory clock domain - NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain - - // Keep this last - NVML_CLOCK_COUNT //usedGpuMemory is not supported - - - unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if - //!< the process is not terminated - - unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process - - unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) - - unsigned int reserved[5]; //!< Reserved for future use -} nvmlAccountingStats_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuConstants Vgpu Constants - * @{ - */ -/***************************************************************************************************/ - -/** - * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense - */ -#define NVML_GRID_LICENSE_BUFFER_SIZE 128 - -#define NVML_VGPU_NAME_BUFFER_SIZE 64 - -#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 - -/*! - * Macros for pGPU's virtualization capabilities bitfield. 
- */ -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuEnum Vgpu Enum - * @{ - */ -/***************************************************************************************************/ - -/*! - * Types of VM identifiers - */ -typedef enum nvmlVgpuVmIdType { - NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID - NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID -} nvmlVgpuVmIdType_t; - -// vGPU GUEST info state. -typedef enum nvmlVgpuGuestInfoState_enum -{ - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //= 0 and < \a unitCount - * @param unit Reference in which to return the unit handle - * - * @return - * - \ref NVML_SUCCESS if \a unit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); - -/** - * Retrieves the static information associated with a unit. - * - * For S-class products. - * - * See \ref nvmlUnitInfo_t for details on available unit info. - * - * @param unit The identifier of the target unit - * @param info Reference in which to return the unit information - * - * @return - * - \ref NVML_SUCCESS if \a info has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL - */ -nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); - -/** - * Retrieves the LED state associated with this unit. - * - * For S-class products. 
- * - * See \ref nvmlLedState_t for details on allowed states. - * - * @param unit The identifier of the target unit - * @param state Reference in which to return the current LED state - * - * @return - * - \ref NVML_SUCCESS if \a state has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitSetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); - -/** - * Retrieves the PSU stats for the unit. - * - * For S-class products. - * - * See \ref nvmlPSUInfo_t for details on available PSU info. - * - * @param unit The identifier of the target unit - * @param psu Reference in which to return the PSU information - * - * @return - * - \ref NVML_SUCCESS if \a psu has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); - -/** - * Retrieves the temperature readings for the unit, in degrees C. - * - * For S-class products. - * - * Depending on the product, readings may be available for intake (type=0), - * exhaust (type=1) and board (type=2). 
- * - * @param unit The identifier of the target unit - * @param type The type of reading to take - * @param temp Reference in which to return the intake temperature - * - * @return - * - \ref NVML_SUCCESS if \a temp has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); - -/** - * Retrieves the fan speed readings for the unit. - * - * For S-class products. - * - * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. - * - * @param unit The identifier of the target unit - * @param fanSpeeds Reference in which to return the fan speed information - * - * @return - * - \ref NVML_SUCCESS if \a fanSpeeds has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); - -/** - * Retrieves the set of GPU devices that are attached to the specified unit. - * - * For S-class products. - * - * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
- * - * @param unit The identifier of the target unit - * @param deviceCount Reference in which to provide the \a devices array size, and - * to return the number of attached GPU devices - * @param devices Reference in which to return the references to the attached GPU devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); - -/** - * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * - * For S-class products. - * - * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. - * The HIC must be connected to an S-class system for it to be reported by this function. 
- * - * @param hwbcCount Size of hwbcEntries array - * @param hwbcEntries Array holding information about hwbc - * - * @return - * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceQueries Device Queries - * This chapter describes that queries that NVML can perform against each device. - * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by - * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(), - * \ref nvmlDeviceGetHandleByPciBusId(). or \ref nvmlDeviceGetHandleByUUID(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of compute devices in the system. A compute device is a single GPU. - * - * For all products. - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. 
- * - * @param deviceCount Reference in which to return the number of accessible devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); - -/** - * Acquire the handle for a particular device, based on its index. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or UUID. See - * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. - * - * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. 
- * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't - * need to worry about that. - * - * @param index The index of the target GPU, >= 0 and < \a accessibleDevices - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetIndex - * @see nvmlDeviceGetCount - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its board serial number. - * - * For Fermi &tm; or newer fully supported devices. - * - * This number corresponds to the value printed directly on the board, and to the value returned by - * \ref nvmlDeviceGetSerial(). - * - * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor - * of \ref nvmlDeviceGetHandleByUUID. - * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
- * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @param serial The board serial number of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one - * device has the same serial (dual GPU boards) - * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetSerial - * @see nvmlDeviceGetHandleByUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. - * - * For all products. 
- * - * @param uuid The UUID of the target GPU - * @param device Reference in which to return the device handle - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null - * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its PCI bus id. - * - * For all products. - * - * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo(). - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND - * instead of NVML_ERROR_NO_PERMISSION. 
- * - * @param pciBusId The PCI bus id of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); - -/** - * Retrieves the name of this device. - * - * For all products. - * - * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not - * exceed 64 characters in length (including the NULL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param name Reference in which to return the product name - * @param length The maximum allowed length of the string returned in \a name - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); - -/** - * Retrieves the brand of this device. - * - * For all products. - * - * The type is a member of \ref nvmlBrandType_t defined above. - * - * @param device The identifier of the target device - * @param type Reference in which to return the product brand type - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); - -/** - * Retrieves the NVML index of this device. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * @param device The identifier of the target device - * @param index Reference in which to return the NVML index of the device - * - * @return - * - \ref NVML_SUCCESS if \a index has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetHandleByIndex() - * @see nvmlDeviceGetCount() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); - -/** - * Retrieves the globally unique board serial number associated with this device's board. - * - * For all products with an inforom. - * - * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). - * This number matches the serial number tag that is physically attached to the board. See \ref - * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param serial Reference in which to return the board/module serial number - * @param length The maximum allowed length of the string returned in \a serial - * - * @return - * - \ref NVML_SUCCESS if \a serial has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); - -/** - * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device - * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, - * result[0] = 0x3, result[1] = 0x3 - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. 
- * - * @param device The identifier of the target device - * @param cpuSetSize The size of the cpuSet array that is safe to access - * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per - * unsigned long on 64-bit machines, 32 on 32-bit machines - * - * @return - * - \ref NVML_SUCCESS if \a cpuAffinity has been filled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); - -/** - * Sets the ideal affinity for the calling thread and device using the guidelines - * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. - * Older versions set the affinity for a calling process and all children. - * Currently supports up to 64 processors. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully bound - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); - -/** - * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version - * 8.0 as older versions cleared the affinity for a calling process and all children. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully unbound - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); - -/** - * Retrieve the common ancestor for two devices - * For all products. - * Supported on Linux only. - * - * @param device1 The identifier of the first device - * @param device2 The identifier of the second device - * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type - * - * @return - * - \ref NVML_SUCCESS if \a pathInfo has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); - -/** - * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level - * For all products. - * Supported on Linux only. - * - * @param device The identifier of the first device - * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found at \a level - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the set of GPUs that have a CPU affinity with the given CPU number - * For all products. - * Supported on Linux only. - * - * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the status for a given p2p capability index between a given pair of GPU - * - * @param device1 The first device - * @param device2 The second device - * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 - * @param p2pStatus Reference in which to return the status of the \a p2pIndex - * between \a device1 and \a device2 - * @return - * - \ref NVML_SUCCESS if \a p2pStatus has been populated - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); - -/** - * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, - * that augments the immutable, board serial identifier. - * - * For all products. - * - * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. - * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param uuid Reference in which to return the GPU UUID - * @param length The maximum allowed length of the string returned in \a uuid - * - * @return - * - \ref NVML_SUCCESS if \a uuid has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); - -/** - * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for - * each GPU will have the form /dev/nvidia[minor number]. - * - * For all products. - * Supported only for Linux - * - * @param device The identifier of the target device - * @param minorNumber Reference in which to return the minor number for the device - * @return - * - \ref NVML_SUCCESS if the minor number is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); - -/** - * Retrieves the the device board part number which is programmed into the board's InfoROM - * - * For all products. 
- * - * @param device Identifier of the target device - * @param partNumber Reference to the buffer to return - * @param length Length of the buffer reference - * - * @return - * - \ref NVML_SUCCESS if \a partNumber has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a serial is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); - -/** - * Retrieves the version information for the device's infoROM object. - * - * For all products with an inforom. - * - * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate - * ECC counts. The version of the data structures in this memory may change from time to time. It will not - * exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. - * - * See \ref nvmlInforomObject_t for details on the available infoROM objects. 
- * - * @param device The identifier of the target device - * @param object The target infoROM object - * @param version Reference in which to return the infoROM version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomImageVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); - -/** - * Retrieves the global infoROM image version - * - * For all products with an inforom. - * - * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board - * in contrast to infoROM object version which is only an indicator of supported features. - * Version string will not exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference in which to return the infoROM image version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Retrieves the checksum of the configuration stored in the device's infoROM. - * - * For all products with an inforom. - * - * Can be used to make sure that two GPUs have the exact same configuration. - * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. - * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) - * - * @param device The identifier of the target device - * @param checksum Reference in which to return the infoROM configuration checksum - * - * @return - * - \ref NVML_SUCCESS if \a checksum has been set - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); - -/** - * Reads the infoROM from the flash and verifies the checksums. - * - * For all products with an inforom. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if infoROM is not corrupted - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); - -/** - * Retrieves the display mode for the device. - * - * For all products. - * - * This method indicates whether a physical display (e.g. monitor) is currently connected to - * any of the device's connectors. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param display Reference in which to return the display mode - * - * @return - * - \ref NVML_SUCCESS if \a display has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); - -/** - * Retrieves the display active state for the device. - * - * For all products. - * - * This method indicates whether a display is initialized on the device. - * For example whether X Server is attached to this device and has allocated memory for the screen. - * - * Display can be active even when no monitor is physically attached. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param isActive Reference in which to return the display active state - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); - -/** - * Retrieves the persistence mode associated with this device. - * - * For all products. - * For Linux only. 
- * - * When driver persistence mode is enabled the driver software state is not torn down when the last - * client disconnects. By default this feature is disabled. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current driver persistence mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the PCI attributes of this device. - * - * For all products. - * - * See \ref nvmlPciInfo_t for details on the available PCI info. - * - * @param device The identifier of the target device - * @param pci Reference in which to return the PCI info - * - * @return - * - \ref NVML_SUCCESS if \a pci has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); - -/** - * Retrieves the maximum PCIe link generation possible with this device and system - * - * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will - * report is generation 1. 
- * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkGen Reference in which to return the max PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); - -/** - * Retrieves the maximum PCIe link width possible with this device and system - * - * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report - * a max link width of 8. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkWidth Reference in which to return the max PCIe link width - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); - -/** - * Retrieves the current PCIe link generation - * - * For Fermi &tm; or newer fully supported devices.
- * - * @param device The identifier of the target device - * @param currLinkGen Reference in which to return the current PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a currLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); - -/** - * Retrieves the current PCIe link width - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param currLinkWidth Reference in which to return the current PCIe link width - * - * @return - * - \ref NVML_SUCCESS if \a currLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); - -/** - * Retrieve PCIe utilization information. - * This function is querying a byte counter over a 20ms interval and thus is the - * PCIe throughput over that interval. - * - * For Maxwell &tm; or newer fully supported devices. - * - * This method is not supported in virtual machines running virtual GPU (vGPU).
- * - * @param device The identifier of the target device - * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t - * @param value Reference in which to return throughput in KB/s - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); - -/** - * Retrieve the PCIe replay counter. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param value Reference in which to return the counter's value - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); - -/** - * Retrieves the current clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information.
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the maximum clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information. - * - * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks - * by few MHz. 
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. - * Can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the default applications clock that GPU boots with or - * defaults to after \ref nvmlDeviceResetApplicationsClocks call. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the default clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * \see nvmlDeviceGetApplicationsClock - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above - * base clocks as thermal limits allow. - * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); - -/** - * Retrieves the clock speed for the clock specified by the clock type and clock ID. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockId Identify which clock in the domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); - -/** - * Retrieves the customer defined maximum boost clock speed specified by the given clock type. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of - * required elements) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedGraphicsClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param memoryClockMHz Memory clock for which to return possible graphics clocks - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clocks in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedMemoryClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior.
- * - * @param device The identifier of the target device - * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device - * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will - * revert to when no applications are using the GPU - * - * @return - * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); - -/** - * Try to set the current state of Auto Boosted clocks on a device. - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * Non-root users may use this API by default but can be restricted by root from using this API by calling - * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. - * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior.
- * - * @param device The identifier of the target device - * @param enabled What state to try to set Auto Boosted clocks of the target device to - * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); - -/** - * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will - * return to when no compute running processes (e.g. CUDA application which have an active context) are running - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. - * - * @param device The identifier of the target device - * @param enabled What state to try to set default Auto Boosted clocks of the target device to - * @param flags Flags that change the default behavior. Currently Unused. 
- * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); - - -/** - * Retrieves the intended operating speed of the device's fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%. - * - * @param device The identifier of the target device - * @param speed Reference in which to return the fan speed percentage - * - * @return - * - \ref NVML_SUCCESS if \a speed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); - -/** - * Retrieves the current temperature readings for the device, in degrees C. - * - * For all products. 
- * - * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. - * - * @param device The identifier of the target device - * @param sensorType Flag that indicates which sensor reading to retrieve - * @param temp Reference in which to return the temperature reading - * - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); - -/** - * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
- * - * @param device The identifier of the target device - * @param thresholdType The type of threshold value queried - * @param temp Reference in which to return the temperature reading - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); - -/** - * Retrieves the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * Retrieves current clocks throttling reasons. - * - * For all fully supported products. - * - * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
- * - * @param device The identifier of the target device - * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle - * reasons - * - * @return - * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetSupportedClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); - -/** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by - * \ref nvmlDeviceGetCurrentClocksThrottleReasons - * - * For all fully supported products. - * - * This method is not supported in virtual machines running virtual GPU (vGPU). 
- * - * @param device The identifier of the target device - * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported - * clocks throttle reasons - * - * @return - * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetCurrentClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); - -/** - * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. - * - * Retrieve the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * This API has been deprecated. - * - * Retrieves the power management mode associated with this device. 
- * - * For products from the Fermi family. - * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. - * - * For products from the Kepler or newer families. - * - Does not require \a NVML_INFOROM_POWER object. - * - * This flag indicates whether any power management algorithm is currently active on the device. An - * enabled state does not necessarily mean the device is being actively throttled -- only that - * the driver will do so if the appropriate conditions are met. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current power management mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the power management limit associated with this device. - * - * For Fermi &tm; or newer fully supported devices. - * - * The power limit defines the upper boundary for the card's power draw. If - * the card's total power draw reaches this limit the power management algorithm kicks in. - * - * This reading is only available if power management mode is supported. - * See \ref nvmlDeviceGetPowerManagementMode.
- * - * @param device The identifier of the target device - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves information about possible values of power management limits on this device. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param minLimit Reference in which to return the minimum power management limit in milliwatts - * @param maxLimit Reference in which to return the maximum power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPowerManagementLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); - -/** - * Retrieves default power management limit on this device, in milliwatts. 
- * Default power management limit is a power management limit that the device boots with. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param defaultLimit Reference in which to return the default power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a defaultLimit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); - -/** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) - * - * For Fermi &tm; or newer fully supported devices. - * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. - * - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
- * - * @param device The identifier of the target device - * @param power Reference in which to return the power usage information - * - * @return - * - \ref NVML_SUCCESS if \a power has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); - -/** - * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded - * - * For newer than Pascal &tm; fully supported devices. - * - * @param device The identifier of the target device - * @param energy Reference in which to return the energy consumption information - * - * @return - * - \ref NVML_SUCCESS if \a energy has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); - -/** - * Get the effective power limit that the driver enforces after taking into account all limiters - * - * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere - * This includes the out of band power limit interface - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The device to communicate with - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current GOM - * @param pending Reference in which to return the pending GOM - * - * @return - * - \ref NVML_SUCCESS if \a current and \a pending have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceSetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); - -/** - * Retrieves the amount of used, free and total memory available on the device, in bytes. - * - * For all products. - * - * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. - * Under WDDM most device memory is allocated and managed on startup by Windows. - * - * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated - * by all active channels on the device. - * - * See \ref nvmlMemory_t for details on available memory info.
- * - * @param device The identifier of the target device - * @param memory Reference in which to return the memory information - * - * @return - * - \ref NVML_SUCCESS if \a memory has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); - -/** - * Retrieves the current compute mode for the device. - * - * For all products. - * - * See \ref nvmlComputeMode_t for details on allowed compute modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current compute mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); - -/** - * Retrieves the CUDA compute capability of the device. - * - * For all products. - * - * Returns the major and minor compute capability version numbers of the - * device. The major and minor versions are equivalent to the - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be - * returned by CUDA's cuDeviceGetAttribute(). 
- * - * @param device The identifier of the target device - * @param major Reference in which to return the major CUDA compute capability - * @param minor Reference in which to return the minor CUDA compute capability - * - * @return - * - \ref NVML_SUCCESS if \a major and \a minor have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); - -/** - * Retrieves the current and pending ECC modes for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * - * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following - * the next reboot. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current ECC mode - * @param pending Reference in which to return the pending ECC mode - * - * @return - * - \ref NVML_SUCCESS if \a current and \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); - -/** - * Retrieves the device boardId from 0-N. - * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with - * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. - * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across - * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and - * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will - * always return those values but they will always be different from each other). - * - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param boardId Reference in which to return the device's board ID - * - * @return - * - \ref NVML_SUCCESS if \a boardId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); - -/** - * Retrieves whether the device is on a Multi-GPU Board - * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param multiGpuBool Reference in which to return a zero or non-zero value - * to indicate whether the device is on a multi GPU board - * - * @return - * - \ref NVML_SUCCESS if \a multiGpuBool has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); - -/** - * Retrieves the total ECC error counts for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires ECC Mode to be enabled. 
- * - * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of - * errors across the entire device. - * - * See \ref nvmlMemoryErrorType_t for a description of available error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); - -/** - * Retrieves the detailed ECC error counts for the device. - * - * @deprecated This API supports only a fixed set of ECC error locations - * On different GPU architectures different locations are supported - * See \ref nvmlDeviceGetMemoryErrorCounter - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. - * Requires ECC Mode to be enabled. 
- * - * Detailed errors provide separate ECC counts for specific parts of the memory system. - * - * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. - * - * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); - -/** - * Retrieves the requested memory error counter for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. - * - * Only applicable to devices with ECC. - * - * Requires ECC Mode to be enabled. 
- * - * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of error. - * @param counterType Flag that specifies the counter-type of the errors. - * @param locationType Specifies the location of the counter. - * @param count Reference in which to return the ECC counter - * - * @return - * - \ref NVML_SUCCESS if \a count has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is - * invalid, or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, - nvmlEccCounterType_t counterType, - nvmlMemoryLocation_t locationType, unsigned long long *count); - -/** - * Retrieves the current utilization rates for the device's major subsystems. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlUtilization_t for details on available utilization rates. - * - * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. - * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
- * - * @param device The identifier of the target device - * @param utilization Reference in which to return the utilization information - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Encoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for encoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** - * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param encoderQueryType Type of encoder to query - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); - -/** - * Retrieves the current encoder statistics for a given device. - * - * For Maxwell &tm; or newer fully supported devices.
- * - * @param device The identifier of the target device - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, - * or \a averageLatency is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about active encoder sessions on a target device. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The - * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accommodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
- * @param sessionInfos Reference in which to return the session information - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfos is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Decoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for decoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** - * Retrieves the current and pending driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. 
- * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. - * - * See \ref nvmlDriverModel_t for details on available driver models. - * - * @param device The identifier of the target device - * @param current Reference in which to return the current driver model - * @param pending Reference in which to return the pending driver model - * - * @return - * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); - -/** - * Get VBIOS version of the device. - * - * For all products. - * - * The VBIOS version may change from time to time. It will not exceed 32 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference to which to return the VBIOS version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Get Bridge Chip Information for all the bridge chips on the board. - * - * For all fully supported products. - * Only applicable to multi-GPU products. - * - * @param device The identifier of the target device - * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy - * - * @return - * - \ref NVML_SUCCESS if bridge chip exists - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); - -/** - * Get information about processes with a compute context on a device - * - * For Fermi &tm; or newer fully supported devices. - * - * This function returns information only about compute running processes (e.g. CUDA application which have - * active context). Any graphics applications (e.g.
using OpenGL, DirectX) won't be listed by this function. - * - * To query the current number of running compute processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new compute processes are spawned. - * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Get information about processes with a graphics context on a device - * - * For Kepler &tm; or newer fully supported devices. - * - * This function returns information only about graphics based processes - * (eg. 
applications using OpenGL, DirectX) - * - * To query the current number of running graphics processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new graphics processes are spawned. - * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Check if the GPU devices are on the same physical board. - * - * For all fully supported products. - * - * @param device1 The first GPU device - * @param device2 The second GPU device - * @param onSameBoard Reference in which to return the status. - * Non-zero indicates that the GPUs are on the same board. 
- * - * @return - * - \ref NVML_SUCCESS if \a onSameBoard has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); - -/** - * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. - * - * For all fully supported products. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted Reference in which to return the current restriction - * NVML_FEATURE_ENABLED indicates that the API is root-only - * NVML_FEATURE_DISABLED indicates that the API is accessible to all users - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support - * the feature that is being queried (E.G. 
Enabling/disabling Auto Boosted clocks is - * not supported by the device) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); - -/** - * Gets recent samples for the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by - * the driver. - * - * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. - * - * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. - * The returned samplesCount will provide the number of samples that can be queried. The user needs to - * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). - * - * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the - * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query - * to get more recent samples. - * - * This method fetches the number of entries which can be accommodated in the provided samples array, and the - * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this - * method for samples in contrast to polling via existing methods is to get get higher frequency data at lower polling cost. - * - * @param device The identifier for the target device - * @param type Type of sampling event - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t - * @param sampleCount Reference to provide the number of elements which can be queried in samples array - * @param samples Reference in which samples are returned - - * @return - * - \ref NVML_SUCCESS if samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or - * reference to \a sampleCount is 0 for non null \a samples - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); - -/** - * Gets Total, Available and Used size of BAR1 memory. - * - * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party - * devices (peer-to-peer on the PCIE bus). - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param bar1Memory Reference in which BAR1 memory - * information is returned. 
- * - * @return - * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); - - -/** - * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power - * or thermal constraints. - * - * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The - * difference in violation times at two different reference times gives the indication of GPU throttling event. - * - * Violation for thermal capping is not supported at this time. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param perfPolicyType Represents Performance policy which can trigger GPU throttling - * @param violTime Reference to which violation time related information is returned - * - * - * @return - * - \ref NVML_SUCCESS if violation time is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Queries the state of per process accounting mode. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlDeviceGetAccountingStats for more details. - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Queries process's accounting stats. - * - * For Kepler &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. 
- * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. - * - * @param device The identifier of the target device - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL - * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); - -/** - * Queries list of processes that can be queried for accounting stats. 
The list of processes returned - * can be in running or terminated state. - * - * For Kepler &tm; or newer fully supported devices. - * - * To just query the number of processes ready to be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlDeviceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. - * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to - * expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); - -/** - * Returns the number of processes that the circular buffer with accounting pids can hold. - * - * For Kepler &tm; or newer fully supported devices. - * - * This is the maximum number of processes that accounting information will be stored for before information - * about oldest processes will get overwritten by information about new processes. 
- * - * @param device The identifier of the target device - * @param bufferSize Reference in which to provide the size (in number of elements) - * of the circular buffer for accounting stats. - * - * @return - * - \ref NVML_SUCCESS if buffer size was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingStats - * @see nvmlDeviceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); - -/** @} */ - -/** @addtogroup nvmlDeviceQueries - * @{ - */ - -/** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * - * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses); - -/** - * Check if any pages are pending retirement and need a reboot to fully retire. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param isPending Reference in which to return the pending status - * - * @return - * - \ref NVML_SUCCESS if \a isPending was populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitCommands Unit Commands - * This chapter describes NVML operations that change the state of the unit. For S-class products. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Set the LED state for the unit. The LED can be either green (0) or amber (1). - * - * For S-class products. - * Requires root/admin permissions. - * - * This operation takes effect immediately. - * - * - * Current S-Class products don't provide unique LEDs for each unit. As such, both front - * and back LEDs will be toggled in unison regardless of which unit is specified with this command. - * - * See \ref nvmlLedColor_t for available colors. - * - * @param unit The identifier of the target unit - * @param color The target LED color - * - * @return - * - \ref NVML_SUCCESS if the LED color has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitGetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceCommands Device Commands - * This chapter describes NVML operations that change the state of the device. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the persistence mode for the device. - * - * For all products. - * For Linux only. - * Requires root/admin permissions. 
- * - * The persistence mode determines whether the GPU driver software is torn down after the last client - * exits. - * - * This operation takes effect immediately. It is not persistent across reboots. After each reboot the - * persistence mode is reset to "Disabled". - * - * See \ref nvmlEnableState_t for available modes. - * - * @param device The identifier of the target device - * @param mode The target persistence mode - * - * @return - * - \ref NVML_SUCCESS if the persistence mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Set the compute mode for the device. - * - * For all products. - * Requires root/admin permissions. - * - * The compute mode determines whether a GPU can be used for compute operations and whether it can - * be shared across contexts. - * - * This operation takes effect immediately. Under Linux it is not persistent across reboots and - * always resets to "Default". Under windows it is persistent. - * - * Under windows compute mode may only be set to DEFAULT when running in WDDM - * - * See \ref nvmlComputeMode_t for details on available compute modes. 
- * - * @param device The identifier of the target device - * @param mode The target compute mode - * - * @return - * - \ref NVML_SUCCESS if the compute mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); - -/** - * Set the ECC mode for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires root/admin permissions. - * - * The ECC mode determines whether the GPU enables its ECC support. - * - * This operation takes effect after the next reboot. - * - * See \ref nvmlEnableState_t for details on available modes. 
- * - * @param device The identifier of the target device - * @param ecc The target ECC mode - * - * @return - * - \ref NVML_SUCCESS if the ECC mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); - -/** - * Clear the ECC error and other memory error counts for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. - * Requires root/admin permissions. - * Requires ECC Mode to be enabled. - * - * Sets all of the specified ECC counters to 0, including both detailed and total counts. - * - * This operation takes effect immediately. - * - * See \ref nvmlMemoryErrorType_t for details on available counter types. - * - * @param device The identifier of the target device - * @param counterType Flag that indicates which type of errors should be cleared. 
- * - * @return - * - \ref NVML_SUCCESS if the error counts were cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see - * - nvmlDeviceGetDetailedEccErrors() - * - nvmlDeviceGetTotalEccErrors() - */ -nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); - -/** - * Set the driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * Requires root/admin permissions. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. - * - * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). - * This should only be done if the host is subsequently powered down and the display is detached from the device - * before the next reboot. - * - * This operation takes effect after the next reboot. - * - * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. - * - * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or - * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. - * - * See \ref nvmlDriverModel_t for details on available driver models. 
- * See \ref nvmlFlagDefault and \ref nvmlFlagForce - * - * @param device The identifier of the target device - * @param driverModel The target driver model - * @param flags Flags that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if the driver model has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); - -/** - * Set clocks that applications will lock to. - * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). - * - * Can be used as a setting to request constant performance. - * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. - * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. 
- * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. - * - * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); - -/** - * Set new power limit of this device. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. - * - * \note Limit is not persistent across reboots or driver unloads. - * Enable persistent mode to prevent driver from unloading when no application is using the device. 
- * - * @param device The identifier of the target device - * @param limit Power management limit in milliwatts to set - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPowerManagementLimitConstraints - * @see nvmlDeviceGetPowerManagementDefaultLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); - -/** - * Sets new GOM. See \a nvmlGpuOperationMode_t for details. - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. - * Requires root/admin permissions. - * - * Changing GOMs requires a reboot. - * The reboot requirement might be removed in the future. - * - * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when - * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. 
- * - * @param device The identifier of the target device - * @param mode Target GOM - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceGetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); - -/** - * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. - * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction - * to query the current restriction settings. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted The target restriction - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support - * the feature that api restrictions are being set for (E.G. 
Enabling/disabling auto - * boosted clocks is not supported by the device) - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Enables or disables per process accounting. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @note This setting is not persistent and will default to disabled after driver unloads. - * Enable persistence mode to be sure the setting doesn't switch off to disabled. - * - * @note Enabling accounting mode has no negative impact on the GPU performance. - * - * @note Disabling accounting clears all accounting pids information. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceClearAccountingPids - * - * @param device The identifier of the target device - * @param mode The target accounting mode - * - * @return - * - \ref NVML_SUCCESS if the new mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Clears accounting information about all processes that have already terminated. 
- * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup NvLink NvLink Methods - * This chapter describes methods that NVML can perform on NVLINK enabled devices. - * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves the state of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that - * the link is active and NVML_FEATURE_DISABLED indicates it - * is inactive - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); - -/** - * Retrieves the version of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param version Requested NvLink version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); - -/** - * Retrieves the requested capability from the device's NvLink for the link specified - * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried - * The return value should be treated as a boolean. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried - * @param capResult A boolean for the queried capability indicating that feature is available - * - * @return - * - \ref NVML_SUCCESS if \a capResult has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); - -/** - * Retrieves the PCI information for the remote node on a NvLink link - * Note: pciSubSystemId is not filled in this function and is indeterminate - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param pci \a nvmlPciInfo_t of the remote node for the specified link - * - * @return - * - \ref NVML_SUCCESS if \a pci has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); - -/** - * Retrieves the specified error counter value - * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the NvLink counter to be queried - * @param counterValue Returned counter value - * - * @return - * - \ref NVML_SUCCESS if \a counter has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, - nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); - -/** - * Resets all error counters to zero - * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * - * @return - * - \ref NVML_SUCCESS if the reset is successful - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); - -/** - * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset - * of the counters if the reset parameter is non-zero. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set - * @param reset Resets the counters on set if non-zero - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control, unsigned int reset); - -/** - * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). 
- * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control); - - -/** - * Retrieve the NVLINK utilization counter based on the current control for a specified counter. - * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl - * before reading the utilization counters as they have no default state - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be read (0 or 1). 
- * @param rxcounter Receive counter return value - * @param txcounter Transmit counter return value - * - * @return - * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, - unsigned long long *rxcounter, unsigned long long *txcounter); - -/** - * Freeze the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be frozen (0 or 1). 
- * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters - * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters - * - * @return - * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, - unsigned int counter, nvmlEnableState_t freeze); - -/** - * Reset the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be reset - * @param counter Specifies the counter that should be reset (0 or 1) - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEvents Event Handling Methods - * This chapter describes methods that NVML can perform against each device to register and wait for - * some event to occur. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Create an empty set of events. - * Event set should be freed by \ref nvmlEventSetFree - * - * For Fermi &tm; or newer fully supported devices. - * @param set Reference in which to return the event handle - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); - -/** - * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t - * - * For Fermi &tm; or newer fully supported devices. - * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) - * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) - * - * For Linux only. - * - * \b IMPORTANT: Operations on \a set are not thread safe - * - * This call starts recording of events on specific device. - * All events that occurred before this call are not recorded. - * Checking if some event occurred can be done with \ref nvmlEventSetWait - * - * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. - * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes - * are registered in that case. 
- * - * @param device The identifier of the target device - * @param eventTypes Bitmask of \ref nvmlEventType to record - * @param set Set to which add new event types - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceGetSupportedEventTypes - * @see nvmlEventSetWait - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); - -/** - * Returns information about events supported on device - * - * For Fermi &tm; or newer fully supported devices. - * - * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. - * - * @param device The identifier of the target device - * @param eventTypes Reference in which to return bitmask of supported events - * - * @return - * - \ref NVML_SUCCESS if the eventTypes has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); - -/** - * Waits on events and delivers events - * - * For Fermi &tm; or newer fully supported devices. 
- * - * If some events are ready to be delivered at the time of the call, function returns immediately. - * If there are no events ready to be delivered, function sleeps till event arrives - * but not longer than specified timeout. This function in certain conditions can return before - * specified timeout passes (e.g. when interrupt arrives) - * - * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple - * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all - * xid error events. - * - * @param set Reference to set of events to wait on - * @param data Reference in which to return event data - * @param timeoutms Maximum amount of wait time in milliseconds for registered event - * - * @return - * - \ref NVML_SUCCESS if the data has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL - * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived - * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); - -/** - * Releases events in the set - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param set Reference to events to be released - * - * @return - * - \ref NVML_SUCCESS if the event has been successfully released - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlZPI Drain states - * This chapter describes methods that NVML can perform against each device to control their drain state - * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to - * power on/off GPUs, enable robust reset scenarios, etc. - * @{ - */ -/***************************************************************************************************/ - -/** - * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. - * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before - * this call is made. - * Must be called as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. 
- * - * @param pciInfo The PCI address of the GPU drain state to be modified - * @param newState The drain state that should be entered, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); - -/** - * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining - * state. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU drain state to be queried - * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); - -/** - * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver - * as long as no other processes are attached. 
If other processes are attached, this call will return - * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the - * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called - * to initiate the draining state is if that process was using, and is still using, a GPU before the - * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled - * prior to this call. - * - * For long-running NVML processes please note that this will change the enumeration of current GPUs. - * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. - * Also, device handles after the removed GPU will not be valid and must be re-established. - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU to be removed - * @param gpuState Whether the GPU is to be removed, from the OS - * see \ref nvmlDetachGpuState_t - * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed - */ -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); - -/** - * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that - * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. 
- * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes - * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. - * - * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds - * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. - * - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device - * fields are used in this call. - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature - * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFieldValueQueries Field Value Queries - * This chapter describes NVML operations that are associated with retrieving Field Values from NVML - * @{ - */ -/***************************************************************************************************/ - -/** - * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. 
- * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs - * will be populated from a single call rather than making a driver call for each fieldId. - * - * @param device The device handle of the GPU to request field values for - * @param valuesCount Number of entries in values that should be retrieved - * @param values Array of \a valuesCount structures to hold field values. - * Each value's fieldId must be populated prior to this call - * - * @return - * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must - * check the nvmlReturn field of each value for each individual - * status - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridQueries Grid Queries - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to get the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
- * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridCommands Grid Commands - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to set the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? - * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. - * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. 
- */ -nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpu vGPU Management - * @{ - * - * Set of APIs supporting GRID vGPU - */ -/***************************************************************************************************/ - -/** - * Retrieve the supported vGPU types on a physical GPU (device). - * - * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. 
- * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the currently creatable vGPU types on a physical GPU (device). - * - * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types - * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable - * list will be restricted to whatever vGPU type is already running on the device. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. 
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeClass Pointer to string array to return class in - * @param size Size of string - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); - -/** - * Retrieve the vGPU type name. - * - * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not - * exceed 64 characters in length (including the NUL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeName Pointer to buffer to return name - * @param size Size of buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); - -/** - * Retrieve the device ID of a vGPU type. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value - * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); - -/** - * Retrieve the vGPU framebuffer size in bytes. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param fbSize Pointer to framebuffer size in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); - -/** - * Retrieve count of vGPU's supported display heads. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param numDisplayHeads Pointer to number of display heads - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); - -/** - * Retrieve vGPU display head's maximum supported resolution. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param displayIndex Zero-based index of display head - * @param xdim Pointer to maximum number of pixels in X dimension - * @param ydim Pointer to maximum number of pixels in Y dimension - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex - * is out of range. 
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); - -/** - * Retrieve license requirements for a vGPU type - * - * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form - * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, - * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". - * - * The total length of the returned string will not exceed 128 characters, including the NUL terminator. - * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeLicenseString Pointer to buffer to return license info - * @param size Size of \a vgpuTypeLicenseString buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); - -/** - * Retrieve the static frame rate limit value of the vGPU type - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param frameRateLimit Reference to return the frame rate limit value - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); - -/** - * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCount Pointer to get the max number of vGPU instances - * that can be created on a deicve for given vgpuTypeId - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, - * or \a vgpuInstanceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); - -/** - * Retrieve the active vGPU instances on a device. - * - * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The - * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances - * written to the buffer. 
- * - * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. - * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return - * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer which passes in the array size as well as get - * back the number of types - * @param vgpuInstances Pointer to array in which to return list of vGPU instances - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); - -/** - * Retrieve the VM ID associated with a vGPU instance. - * - * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vmId Pointer to caller-supplied buffer to hold VM ID - * @param size Size of buffer in bytes - * @param vmIdType Pointer to hold VM ID type - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); - -/** - * Retrieve the UUID of a vGPU instance. - * - * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); - -/** - * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. - * - * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. 
The length of the version - * string will not exceed 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is - * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the - * NVIDIA driver is loaded and initialized. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param version Caller-supplied buffer to return driver version string - * @param length Size of \a version buffer - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); - -/** - * Retrieve the framebuffer usage in bytes. - * - * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance The identifier of the target instance - * @param fbUsage Pointer to framebuffer usage in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a fbUsage is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); - -/** - * Retrieve the current licensing state of the vGPU instance. - * - * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param licensed Reference to return the licensing status - * - * @return - * - \ref NVML_SUCCESS if \a licensed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a licensed is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); - -/** - * Retrieve the vGPU type of a vGPU instance. - * - * Returns the vGPU type ID of vgpu assigned to the vGPU instance. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vgpuTypeId Reference to return the vgpuTypeId - * - * @return - * - \ref NVML_SUCCESS if \a vgpuTypeId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); - -/** - * Retrieve the frame rate limit set for the vGPU instance. - * - * Returns the value of the frame rate limit set for the vGPU instance - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param frameRateLimit Reference to return the frame rate limit - * - * @return - * - \ref NVML_SUCCESS if \a frameRateLimit has been set - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); - -/** - * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a encoderQueryType is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); - -/** - * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Unsigned int for the encoder capacity value - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); - -/** - * Retrieves current utilization for vGPUs on a physical GPU (device). - * - * For Kepler &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running - * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer - * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the - * CPU timestamp at which the samples were recorded. 
Individual utilization values are returned as "unsigned int" values - * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to - * indicate the returned value type. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate - * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample - * structures that were actually written. This may differ from a previously read value as vGPU instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values - * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is - * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, - nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); - -/** - * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on - * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the - * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running - * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which - * the samples were recorded. Individual utilization values are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size - * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample - * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active - * in any given sample period. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is - * passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - unsigned int *vgpuProcessSamplesCount, - nvmlVgpuProcessUtilizationSample_t *utilizationSamples); -/** - * Retrieve the GRID licensable features. - * - * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. 
- * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned - * - * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); - -/** - * Retrieves the current encoder statistics of a vGPU Instance - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL - * or \a vgpuInstance is invalid. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about all active encoder sessions on a vGPU Instance. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The - * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. 
- * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to caller supplied array size, and returns - * the number of sessions. - * @param sessionInfo Reference to caller supplied array in which the list - * of session information us returned. - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfo is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is - returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid.. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); - -/** - * Retrieves the current utilization and process ID - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. 
Individual utilization values - * are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. - * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvml vGPU Migration - * This chapter describes NVML operations that are associated with vGPU Migration. - * @{ - */ -/***************************************************************************************************/ - -/** - * vGPU metadata structure. 
- */ -typedef struct nvmlVgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields - char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host - unsigned int reserved[8]; //!< Reserved for internal use - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuMetadata_t; - -/** - * Physical GPU metadata structure - */ -typedef struct nvmlVgpuPgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version - unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld - unsigned int reserved[7]; //!< Reserved for internal use - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuPgpuMetadata_t; - -/** - * vGPU VM compatibility codes - */ -typedef enum nvmlVgpuVmCompatibility_enum -{ - NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable - NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) - NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) - NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) - NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused (ACPI S0) -} nvmlVgpuVmCompatibility_t; - -/** - * vGPU-pGPU compatibility limit codes - */ -typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum -{ - NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< 
Compatibility is not limited. - NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. - NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor. -} nvmlVgpuPgpuCompatibilityLimitCode_t; - -/** - * vGPU-pGPU compatibility structure - */ -typedef struct nvmlVgpuPgpuCompatibility_st -{ - nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t - nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t -} nvmlVgpuPgpuCompatibility_t; - -/** - * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM - * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section - * containing internal state. - * - * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are - * dependent on information obtained from the guest VM, which may not yet have reached a state where that information - * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field. - * - * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide - * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. - * - * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. 
If the vGPU Metadata structure - * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. - * - * @param vgpuInstance vGPU instance handle - * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written - * @param bufferSize Size of vgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is invalid; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); - -/** - * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about - * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section - * containing internal state. - * - * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata - * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. 
- * - * @param device The identifier of the target device - * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written - * @param bufferSize Pointer to size of \a pgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS GPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); - -/** - * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a - * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the - * physical GPU. - * - * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The - * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility - * with the physical GPU is limited, a limit code indicates the factor limiting compability. - * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). - * - * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to - * boot a given vGPU or associated VM. 
- * - * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure - * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure - * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); - -/** @} */ - -/** - * NVML API versioning support - */ -#if defined(__NVML_API_VERSION_INTERNAL) -#undef nvmlDeviceRemoveGpu -#undef nvmlDeviceGetNvLinkRemotePciInfo -#undef nvmlDeviceGetPciInfo -#undef nvmlDeviceGetCount -#undef nvmlDeviceGetHandleByIndex -#undef nvmlDeviceGetHandleByPciBusId -#undef nvmlInit -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c deleted file mode 100644 index a3d162c0e1b..00000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -#include -#include - -#include "nvml_dl.h" - -#define DLSYM(x, sym) \ -do { \ - dlerror(); \ - x = dlsym(handle, #sym); \ - if (dlerror() != NULL) { \ - return (NVML_ERROR_FUNCTION_NOT_FOUND); \ - } \ -} while (0) - -typedef nvmlReturn_t (*nvmlSym_t)(); - -static void *handle; - -nvmlReturn_t NVML_DL(nvmlInit)(void) -{ - handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL); - if (handle == NULL) { - return (NVML_ERROR_LIBRARY_NOT_FOUND); - } - return (nvmlInit()); -} - -nvmlReturn_t NVML_DL(nvmlShutdown)(void) -{ - nvmlReturn_t r = nvmlShutdown(); - if (r != NVML_SUCCESS) { - return (r); - } - return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS); -} - -nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( - nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info) -{ - nvmlSym_t sym; - - DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor); - return ((*sym)(dev1, dev2, info)); -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h deleted file mode 100644 index 628f0b3a2c2..00000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -#ifndef _NVML_DL_H_ -#define _NVML_DL_H_ - -#include "nvml.h" - -#define NVML_DL(x) x##_dl - -extern nvmlReturn_t NVML_DL(nvmlInit)(void); -extern nvmlReturn_t NVML_DL(nvmlShutdown)(void); -extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( - nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *); - -#endif // _NVML_DL_H_ diff --git a/vendor/modules.txt b/vendor/modules.txt index 4dfe88ee06b..6ca76bfd318 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -52,9 +52,6 @@ github.com/Microsoft/go-winio/pkg/guid # github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 ## explicit github.com/Microsoft/hcsshim/osversion -# github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 -## explicit -github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml # github.com/NYTimes/gziphandler v1.0.1 => github.com/NYTimes/gziphandler v1.0.0 ## explicit github.com/NYTimes/gziphandler diff --git a/website/content/docs/devices/external/index.mdx b/website/content/docs/devices/external/index.mdx index 1a13cc2882e..76ae4d726a9 100644 --- a/website/content/docs/devices/external/index.mdx +++ b/website/content/docs/devices/external/index.mdx @@ -1,30 +1,29 @@ --- layout: docs -page_title: 'Device Plugins: Community Supported' -description: A list of community supported Device Plugins. +page_title: 'Device Plugins: External' +description: 'A list of external Device Plugins.' --- -# Community Supported - -If you have authored a device plugin that you believe will be useful to the -broader Nomad community and you are committed to maintaining the plugin, please -file a PR to add your plugin to this page. - -## Device Plugins +# External Device Plugins Nomad has a plugin system for defining task drivers. External device driver plugins will have the same user experience as built in devices. 
-Below is a list of community-supported task drivers you can use with Nomad: +Below is a list of official external device plugins you can use with Nomad: -- [USB][usb] +- [Nvidia][nvidia] -## Authoring Device Plugins +## Community Supported -Nomad has a plugin system for defining device drivers. External device plugins -will have the same user experience as built in drivers. For details on -authoring a device plugin, please refer to the [plugin authoring -guide][plugin_guide]. +If you have authored a device plugin that you believe will be useful to the +broader Nomad community and you are committed to maintaining the plugin, +please file a PR to add your plugin to this page. For details on authoring a +device plugin, please refer to the [plugin authoring guide][plugin_guide]. + +Below is a list of community-supported device plugins you can use with Nomad: + +- [USB][usb] [plugin_guide]: /docs/internals/plugins +[nvidia]: /docs/devices/external/nvidia [usb]: /docs/devices/external/usb diff --git a/website/content/docs/devices/nvidia.mdx b/website/content/docs/devices/external/nvidia.mdx similarity index 100% rename from website/content/docs/devices/nvidia.mdx rename to website/content/docs/devices/external/nvidia.mdx diff --git a/website/content/docs/devices/index.mdx b/website/content/docs/devices/index.mdx index a342d8f4be0..28c5e58893e 100644 --- a/website/content/docs/devices/index.mdx +++ b/website/content/docs/devices/index.mdx @@ -6,18 +6,13 @@ description: Device Plugins are used to expose devices to tasks in Nomad. # Device Plugins -Device plugins are used to detect and make devices available to tasks in Nomad. -Devices are physical hardware that exists on a node such as a GPU or an FPGA. By -having extensible device plugins, Nomad has the flexibility to support a broad -set of devices and allows the community to build additional device plugins as -needed. +Device plugins are used to detect and make devices available to tasks in +Nomad.
Devices are physical hardware that exists on a client node such as a +GPU or an FPGA. By having extensible device plugins, Nomad has the flexibility +to support a broad set of devices and allows the community to build additional +device plugins as needed. The list of supported device plugins is provided on the left of this page. Each device plugin documents its configuration and installation requirements, the attributes it fingerprints, and the environment variables it exposes to tasks. - -For details on authoring a device plugin, please refer to the [plugin authoring -guide][plugin_guide]. - -[plugin_guide]: /docs/internals/plugins diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 1751a892208..1e9993dea0e 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -1438,16 +1438,16 @@ "path": "devices" }, { - "title": "Nvidia", - "path": "devices/nvidia" - }, - { - "title": "Community", + "title": "External", "routes": [ { "title": "Overview", "path": "devices/external" }, + { + "title": "Nvidia", + "path": "devices/external/nvidia" + }, { "title": "USB Beta", "path": "devices/external/usb" @@ -1760,7 +1760,7 @@ { "title": "Overview", "path": "enterprise" - }, + }, { "title": "License", "routes": [