Skip to content

Commit

Permalink
Store the typed value directly in dcgmMetric; avoid unsafe.
Browse files Browse the repository at this point in the history
  • Loading branch information
igorpeshansky committed Jul 26, 2024
1 parent 0454c90 commit 22a7484
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 73 deletions.
19 changes: 13 additions & 6 deletions receiver/dcgmreceiver/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ type dcgmClient struct {
type dcgmMetric struct {
timestamp int64
name string
value [4096]byte
value interface{}
}

// Can't pass argument dcgm.mode because it is unexported
Expand Down Expand Up @@ -388,7 +388,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error)
for i := 0; retry && i < client.maxRetries; i++ {
fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs)
if pollErr == nil {
gpuMetrics[gpuIndex], retry = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues)
gpuMetrics[gpuIndex], retry = client.appendMetrics(gpuMetrics[gpuIndex], gpuIndex, fieldValues)
if retry {
client.logger.Warnf("Retrying poll of DCGM daemon for GPU %d; attempt %d", gpuIndex, i+1)
time.Sleep(client.pollingInterval)
Expand All @@ -406,7 +406,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error)
return gpuMetrics, err.Combine()
}

func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) {
func (client *dcgmClient) appendMetrics(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) {
retry = false
for _, fieldValue := range fieldValues {
dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)]
Expand All @@ -419,13 +419,20 @@ func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, f
continue
}

var metricValue interface{}
switch fieldValue.FieldType {
case dcgm.DCGM_FT_DOUBLE:
client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Float64())
value := fieldValue.Float64()
client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, value)
metricValue = value
case dcgm.DCGM_FT_INT64:
client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Int64())
value := fieldValue.Int64()
client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, value)
metricValue = value
default:
metricValue = fieldValue.Value
}
gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, fieldValue.Value})
gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, metricValue})
}

return gpuMetrics, retry
Expand Down
51 changes: 35 additions & 16 deletions receiver/dcgmreceiver/client_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,17 @@ func TestCollectGpuProfilingMetrics(t *testing.T) {
after := time.Now().UnixMicro()
assert.Nil(t, err)

asFloat64 := func(metric dcgmMetric) float64 {
require.IsTypef(t, float64(0), metric.value, "Unexpected metric type: %T", metric.value)
value, _ := metric.value.(float64)
return value
}
asInt64 := func(metric dcgmMetric) int64 {
require.IsTypef(t, int64(0), metric.value, "Unexpected metric type: %T", metric.value)
value, _ := metric.value.(int64)
return value
}

seenMetric := make(map[string]bool)
assert.GreaterOrEqual(t, len(deviceMetrics), 0)
assert.LessOrEqual(t, len(deviceMetrics), 32)
Expand All @@ -172,25 +183,28 @@ func TestCollectGpuProfilingMetrics(t *testing.T) {
case "DCGM_FI_PROF_PIPE_FP16_ACTIVE":
fallthrough
case "DCGM_FI_PROF_DRAM_ACTIVE":
assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0))
assert.LessOrEqual(t, metric.asFloat64(), float64(1.0))
value := asFloat64(metric)
assert.GreaterOrEqual(t, value, float64(0.0))
assert.LessOrEqual(t, value, float64(1.0))
case "DCGM_FI_DEV_GPU_UTIL":
fallthrough
case "DCGM_FI_DEV_MEM_COPY_UTIL":
fallthrough
case "DCGM_FI_DEV_ENC_UTIL":
fallthrough
case "DCGM_FI_DEV_DEC_UTIL":
assert.GreaterOrEqual(t, metric.asInt64(), int64(0))
assert.LessOrEqual(t, metric.asInt64(), int64(100))
value := asInt64(metric)
assert.GreaterOrEqual(t, value, int64(0))
assert.LessOrEqual(t, value, int64(100))
case "DCGM_FI_DEV_FB_FREE":
fallthrough
case "DCGM_FI_DEV_FB_USED":
fallthrough
case "DCGM_FI_DEV_FB_RESERVED":
// arbitrary max of 10 TiB
assert.GreaterOrEqual(t, metric.asInt64(), int64(0))
assert.LessOrEqual(t, metric.asInt64(), int64(10485760))
value := asInt64(metric)
assert.GreaterOrEqual(t, value, int64(0))
assert.LessOrEqual(t, value, int64(10485760))
case "DCGM_FI_PROF_PCIE_TX_BYTES":
fallthrough
case "DCGM_FI_PROF_PCIE_RX_BYTES":
Expand All @@ -199,8 +213,9 @@ func TestCollectGpuProfilingMetrics(t *testing.T) {
fallthrough
case "DCGM_FI_PROF_NVLINK_RX_BYTES":
// arbitrary max of 10 TiB/sec
assert.GreaterOrEqual(t, metric.asInt64(), int64(0))
assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760))
value := asInt64(metric)
assert.GreaterOrEqual(t, value, int64(0))
assert.LessOrEqual(t, value, int64(10995116277760))
case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION":
fallthrough
case "DCGM_FI_DEV_LOW_UTIL_VIOLATION":
Expand All @@ -216,22 +231,26 @@ func TestCollectGpuProfilingMetrics(t *testing.T) {
case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION":
fallthrough
case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION":
assert.GreaterOrEqual(t, metric.asInt64(), int64(0))
assert.LessOrEqual(t, metric.asInt64(), time.Now().UnixMicro())
value := asInt64(metric)
assert.GreaterOrEqual(t, value, int64(0))
assert.LessOrEqual(t, value, time.Now().UnixMicro())
case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL":
fallthrough
case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL":
// arbitrary max of 100000000 errors
assert.GreaterOrEqual(t, metric.asInt64(), int64(0))
assert.LessOrEqual(t, metric.asInt64(), int64(100000000))
value := asInt64(metric)
assert.GreaterOrEqual(t, value, int64(0))
assert.LessOrEqual(t, value, int64(100000000))
case "DCGM_FI_DEV_GPU_TEMP":
// arbitrary max of 100000 °C
assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0))
assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0))
value := asFloat64(metric)
assert.GreaterOrEqual(t, value, float64(0.0))
assert.LessOrEqual(t, value, float64(100000.0))
case "DCGM_FI_DEV_SM_CLOCK":
// arbitrary max of 100000 MHz
assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0))
assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0))
value := asFloat64(metric)
assert.GreaterOrEqual(t, value, float64(0.0))
assert.LessOrEqual(t, value, float64(100000.0))
case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION":
// TODO
case "DCGM_FI_DEV_POWER_USAGE":
Expand Down
13 changes: 2 additions & 11 deletions receiver/dcgmreceiver/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package dcgmreceiver

import (
"fmt"
"unsafe"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
)
Expand All @@ -32,20 +31,12 @@ var (
errUnexpectedType = fmt.Errorf("unexpected data type")
)

func (m *dcgmMetric) setFloat64(val float64) {
*(*float64)(unsafe.Pointer(&m.value[0])) = val
}

func (m *dcgmMetric) asFloat64() float64 {
return *(*float64)(unsafe.Pointer(&m.value[0]))
}

func (m *dcgmMetric) setInt64(val int64) {
*(*int64)(unsafe.Pointer(&m.value[0])) = val
return m.value.(float64)
}

func (m *dcgmMetric) asInt64() int64 {
return *(*int64)(unsafe.Pointer(&m.value[0]))
return m.value.(int64)
}

func isValidValue(fieldValue dcgm.FieldValue_v1) error {
Expand Down
40 changes: 0 additions & 40 deletions receiver/dcgmreceiver/util_test.go

This file was deleted.

0 comments on commit 22a7484

Please sign in to comment.