Skip to content

Commit

Permalink
Fix GetComputeRunningProcesses on CUDA 10.x
Browse files Browse the repository at this point in the history
Due to additional fields added to the nvmlProcessInfo_st in CUDA 11.x, there
is a mismatch in the size of the elements of the slice returned by the
NVML call to get the running Compute / Graphics processes. This change
attempts to detect whether the returned slice needs to be reinterpreted
as a slice of smaller elements.

Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Aug 12, 2021
1 parent 10a3a25 commit d566199
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 8 deletions.
39 changes: 39 additions & 0 deletions gen/nvml/cgo_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
package nvml

import (
"bytes"
"encoding/binary"
"fmt"
"unsafe"
)

Expand Down Expand Up @@ -62,3 +65,39 @@ func unpackPCharString(str string) (*C.char, *struct{}) {
h := (*stringHeader)(unsafe.Pointer(&str))
return (*C.char)(h.Data), cgoAllocsUnknown
}

// adjustProcessInfoSlice re-reads a ProcessInfo slice as records of the smaller
// v1 process-info structure (pid and usedGpuMemory only) and returns one
// ProcessInfo per input element, populated from those records. It is used to
// account for fields that were added to the structure across NVML versions.
//
// The conversion serializes in with encoding/binary (little-endian) and decodes
// the resulting bytes as v1 records. An error is returned if either step fails.
//
// NOTE(review): encoding/binary encodes struct fields back to back with no
// alignment padding, so the byte stream is not the in-memory image of in.
// Whether records after the first line up with what the _v1 NVML call wrote
// depends on the layout of ProcessInfo, which is defined elsewhere; confirm.
func adjustProcessInfoSlice(in []ProcessInfo) ([]ProcessInfo, error) {
	// The fields must be exported: binary.Read fills structs via reflection
	// and panics when asked to set an unexported, non-blank field.
	type v1ProcessInfo struct {
		Pid           uint32
		UsedGpuMemory uint64
	}

	b := &bytes.Buffer{}
	if err := binary.Write(b, binary.LittleEndian, in); err != nil {
		return nil, fmt.Errorf("error creating temporary buffer: %v", err)
	}

	// Only the first len(in) v1 records are consumed below, so decode exactly
	// that many. Requesting more gains nothing and fails with an unexpected
	// EOF whenever a serialized ProcessInfo is smaller than two v1 records.
	intermediate := make([]v1ProcessInfo, len(in))
	if err := binary.Read(b, binary.LittleEndian, intermediate); err != nil {
		return nil, fmt.Errorf("error reading intermediate values: %v", err)
	}

	out := make([]ProcessInfo, 0, len(in))
	for _, pin := range intermediate {
		out = append(out, ProcessInfo{
			Pid:           pin.Pid,
			UsedGpuMemory: pin.UsedGpuMemory,
		})
	}

	return out, nil
}
36 changes: 32 additions & 4 deletions gen/nvml/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -933,18 +933,32 @@ func (Device Device) GetBridgeChipInfo() (BridgeChipHierarchy, Return) {

// nvml.DeviceGetComputeRunningProcesses()
//
// DeviceGetComputeRunningProcesses calls nvmlDeviceGetComputeRunningProcesses
// for Device, doubling the ProcessInfo buffer and retrying for as long as the
// call reports ERROR_INSUFFICIENT_SIZE. Any other non-SUCCESS return is handed
// back to the caller together with a nil slice.
//
// On SUCCESS the first InfoCount entries are returned. When the _v1 symbol is
// in use (usesNvmlDeviceGetComputeRunningProcesses_v1) they are first passed
// through adjustProcessInfoSlice; a failure there is reported as ERROR_UNKNOWN.
func DeviceGetComputeRunningProcesses(Device Device) ([]ProcessInfo, Return) {
	var Infos []ProcessInfo
	var InfoCount uint32 = 1 // Will be reduced upon returning
	for {
		// Assign to the outer Infos (no :=) so the buffer is still visible
		// after the loop, and break rather than return on success so the
		// _v1 adjustment below is actually applied.
		Infos = make([]ProcessInfo, InfoCount)
		ret := nvmlDeviceGetComputeRunningProcesses(Device, &InfoCount, &Infos[0])
		if ret == SUCCESS {
			break
		}
		if ret != ERROR_INSUFFICIENT_SIZE {
			return nil, ret
		}
		InfoCount *= 2
	}

	if InfoCount == 0 {
		return []ProcessInfo{}, SUCCESS
	}
	if usesNvmlDeviceGetComputeRunningProcesses_v1 {
		// in the case of the _v1 API we need to adjust the size of the process info data structure
		adjusted, err := adjustProcessInfoSlice(Infos[:InfoCount])
		if err != nil {
			return nil, ERROR_UNKNOWN
		}
		return adjusted, SUCCESS
	}
	return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {
Expand All @@ -953,18 +967,32 @@ func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {

// nvml.DeviceGetGraphicsRunningProcesses()
//
// DeviceGetGraphicsRunningProcesses calls nvmlDeviceGetGraphicsRunningProcesses
// for Device, doubling the ProcessInfo buffer and retrying for as long as the
// call reports ERROR_INSUFFICIENT_SIZE. Any other non-SUCCESS return is handed
// back to the caller together with a nil slice.
//
// On SUCCESS the first InfoCount entries are returned. When the _v1 symbol is
// in use (usesNvmlDeviceGetGraphicsRunningProcesses_v1) they are first passed
// through adjustProcessInfoSlice; a failure there is reported as ERROR_UNKNOWN.
func DeviceGetGraphicsRunningProcesses(Device Device) ([]ProcessInfo, Return) {
	var Infos []ProcessInfo
	var InfoCount uint32 = 1 // Will be reduced upon returning
	for {
		// Assign to the outer Infos (no :=) so the buffer is still visible
		// after the loop, and break rather than return on success so the
		// _v1 adjustment below is actually applied.
		Infos = make([]ProcessInfo, InfoCount)
		ret := nvmlDeviceGetGraphicsRunningProcesses(Device, &InfoCount, &Infos[0])
		if ret == SUCCESS {
			break
		}
		if ret != ERROR_INSUFFICIENT_SIZE {
			return nil, ret
		}
		InfoCount *= 2
	}

	if InfoCount == 0 {
		return []ProcessInfo{}, SUCCESS
	}
	if usesNvmlDeviceGetGraphicsRunningProcesses_v1 {
		// in the case of the _v1 API we need to adjust the size of the process info data structure
		adjusted, err := adjustProcessInfoSlice(Infos[:InfoCount])
		if err != nil {
			return nil, ERROR_UNKNOWN
		}
		return adjusted, SUCCESS
	}
	return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetGraphicsRunningProcesses() ([]ProcessInfo, Return) {
Expand Down
5 changes: 5 additions & 0 deletions gen/nvml/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ var nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v1
var nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v1
var nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v1

// These flags record whether the _v1 running-process entry points are bound.
// Both start out true; updateVersionedSymbols clears a flag when the lookup of
// the matching _v2 symbol succeeds.
var (
	usesNvmlDeviceGetComputeRunningProcesses_v1  = true
	usesNvmlDeviceGetGraphicsRunningProcesses_v1 = true
)

// updateVersionedSymbols()
func updateVersionedSymbols() {
err := nvml.Lookup("nvmlInit_v2")
Expand Down Expand Up @@ -153,10 +156,12 @@ func updateVersionedSymbols() {
err = nvml.Lookup("nvmlDeviceGetComputeRunningProcesses_v2")
if err == nil {
nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v2
usesNvmlDeviceGetComputeRunningProcesses_v1 = false
}
err = nvml.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v2")
if err == nil {
nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v2
usesNvmlDeviceGetGraphicsRunningProcesses_v1 = false
}

}
39 changes: 39 additions & 0 deletions pkg/nvml/cgo_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
package nvml

import (
"bytes"
"encoding/binary"
"fmt"
"unsafe"
)

Expand Down Expand Up @@ -62,3 +65,39 @@ func unpackPCharString(str string) (*C.char, *struct{}) {
h := (*stringHeader)(unsafe.Pointer(&str))
return (*C.char)(h.Data), cgoAllocsUnknown
}

// adjustProcessInfoSlice re-reads a ProcessInfo slice as records of the smaller
// v1 process-info structure (pid and usedGpuMemory only) and returns one
// ProcessInfo per input element, populated from those records. It is used to
// account for fields that were added to the structure across NVML versions.
//
// The conversion serializes in with encoding/binary (little-endian) and decodes
// the resulting bytes as v1 records. An error is returned if either step fails.
//
// NOTE(review): encoding/binary encodes struct fields back to back with no
// alignment padding, so the byte stream is not the in-memory image of in.
// Whether records after the first line up with what the _v1 NVML call wrote
// depends on the layout of ProcessInfo, which is defined elsewhere; confirm.
func adjustProcessInfoSlice(in []ProcessInfo) ([]ProcessInfo, error) {
	// The fields must be exported: binary.Read fills structs via reflection
	// and panics when asked to set an unexported, non-blank field.
	type v1ProcessInfo struct {
		Pid           uint32
		UsedGpuMemory uint64
	}

	b := &bytes.Buffer{}
	if err := binary.Write(b, binary.LittleEndian, in); err != nil {
		return nil, fmt.Errorf("error creating temporary buffer: %v", err)
	}

	// Only the first len(in) v1 records are consumed below, so decode exactly
	// that many. Requesting more gains nothing and fails with an unexpected
	// EOF whenever a serialized ProcessInfo is smaller than two v1 records.
	intermediate := make([]v1ProcessInfo, len(in))
	if err := binary.Read(b, binary.LittleEndian, intermediate); err != nil {
		return nil, fmt.Errorf("error reading intermediate values: %v", err)
	}

	out := make([]ProcessInfo, 0, len(in))
	for _, pin := range intermediate {
		out = append(out, ProcessInfo{
			Pid:           pin.Pid,
			UsedGpuMemory: pin.UsedGpuMemory,
		})
	}

	return out, nil
}
36 changes: 32 additions & 4 deletions pkg/nvml/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -933,18 +933,32 @@ func (Device Device) GetBridgeChipInfo() (BridgeChipHierarchy, Return) {

// nvml.DeviceGetComputeRunningProcesses()
//
// DeviceGetComputeRunningProcesses calls nvmlDeviceGetComputeRunningProcesses
// for Device, doubling the ProcessInfo buffer and retrying for as long as the
// call reports ERROR_INSUFFICIENT_SIZE. Any other non-SUCCESS return is handed
// back to the caller together with a nil slice.
//
// On SUCCESS the first InfoCount entries are returned. When the _v1 symbol is
// in use (usesNvmlDeviceGetComputeRunningProcesses_v1) they are first passed
// through adjustProcessInfoSlice; a failure there is reported as ERROR_UNKNOWN.
func DeviceGetComputeRunningProcesses(Device Device) ([]ProcessInfo, Return) {
	var Infos []ProcessInfo
	var InfoCount uint32 = 1 // Will be reduced upon returning
	for {
		// Assign to the outer Infos (no :=) so the buffer is still visible
		// after the loop, and break rather than return on success so the
		// _v1 adjustment below is actually applied.
		Infos = make([]ProcessInfo, InfoCount)
		ret := nvmlDeviceGetComputeRunningProcesses(Device, &InfoCount, &Infos[0])
		if ret == SUCCESS {
			break
		}
		if ret != ERROR_INSUFFICIENT_SIZE {
			return nil, ret
		}
		InfoCount *= 2
	}

	if InfoCount == 0 {
		return []ProcessInfo{}, SUCCESS
	}
	if usesNvmlDeviceGetComputeRunningProcesses_v1 {
		// in the case of the _v1 API we need to adjust the size of the process info data structure
		adjusted, err := adjustProcessInfoSlice(Infos[:InfoCount])
		if err != nil {
			return nil, ERROR_UNKNOWN
		}
		return adjusted, SUCCESS
	}
	return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {
Expand All @@ -953,18 +967,32 @@ func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {

// nvml.DeviceGetGraphicsRunningProcesses()
//
// DeviceGetGraphicsRunningProcesses calls nvmlDeviceGetGraphicsRunningProcesses
// for Device, doubling the ProcessInfo buffer and retrying for as long as the
// call reports ERROR_INSUFFICIENT_SIZE. Any other non-SUCCESS return is handed
// back to the caller together with a nil slice.
//
// On SUCCESS the first InfoCount entries are returned. When the _v1 symbol is
// in use (usesNvmlDeviceGetGraphicsRunningProcesses_v1) they are first passed
// through adjustProcessInfoSlice; a failure there is reported as ERROR_UNKNOWN.
func DeviceGetGraphicsRunningProcesses(Device Device) ([]ProcessInfo, Return) {
	var Infos []ProcessInfo
	var InfoCount uint32 = 1 // Will be reduced upon returning
	for {
		// Assign to the outer Infos (no :=) so the buffer is still visible
		// after the loop, and break rather than return on success so the
		// _v1 adjustment below is actually applied.
		Infos = make([]ProcessInfo, InfoCount)
		ret := nvmlDeviceGetGraphicsRunningProcesses(Device, &InfoCount, &Infos[0])
		if ret == SUCCESS {
			break
		}
		if ret != ERROR_INSUFFICIENT_SIZE {
			return nil, ret
		}
		InfoCount *= 2
	}

	if InfoCount == 0 {
		return []ProcessInfo{}, SUCCESS
	}
	if usesNvmlDeviceGetGraphicsRunningProcesses_v1 {
		// in the case of the _v1 API we need to adjust the size of the process info data structure
		adjusted, err := adjustProcessInfoSlice(Infos[:InfoCount])
		if err != nil {
			return nil, ERROR_UNKNOWN
		}
		return adjusted, SUCCESS
	}
	return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetGraphicsRunningProcesses() ([]ProcessInfo, Return) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/nvml/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ var nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v1
var nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v1
var nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v1

// These flags record whether the _v1 running-process entry points are bound.
// Both start out true; updateVersionedSymbols clears a flag when the lookup of
// the matching _v2 symbol succeeds.
var (
	usesNvmlDeviceGetComputeRunningProcesses_v1  = true
	usesNvmlDeviceGetGraphicsRunningProcesses_v1 = true
)

// updateVersionedSymbols()
func updateVersionedSymbols() {
err := nvml.Lookup("nvmlInit_v2")
Expand Down Expand Up @@ -153,10 +156,12 @@ func updateVersionedSymbols() {
err = nvml.Lookup("nvmlDeviceGetComputeRunningProcesses_v2")
if err == nil {
nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v2
usesNvmlDeviceGetComputeRunningProcesses_v1 = false
}
err = nvml.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v2")
if err == nil {
nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v2
usesNvmlDeviceGetGraphicsRunningProcesses_v1 = false
}

}

0 comments on commit d566199

Please sign in to comment.