diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 35e31d64d..6c1122294 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -58,7 +58,7 @@ type dcgmClient struct { type dcgmMetric struct { timestamp int64 name string - value [4096]byte + value interface{} } // Can't pass argument dcgm.mode because it is unexported @@ -388,7 +388,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) for i := 0; retry && i < client.maxRetries; i++ { fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) if pollErr == nil { - gpuMetrics[gpuIndex], retry = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues) + gpuMetrics[gpuIndex], retry = client.appendMetrics(gpuMetrics[gpuIndex], gpuIndex, fieldValues) if retry { client.logger.Warnf("Retrying poll of DCGM daemon for GPU %d; attempt %d", gpuIndex, i+1) time.Sleep(client.pollingInterval) @@ -406,7 +406,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) return gpuMetrics, err.Combine() } -func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) { +func (client *dcgmClient) appendMetrics(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) { retry = false for _, fieldValue := range fieldValues { dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] @@ -419,13 +419,20 @@ func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, f continue } + var metricValue interface{} switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Float64()) + value := fieldValue.Float64() + client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, value) + metricValue = value case dcgm.DCGM_FT_INT64: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Int64()) + value := fieldValue.Int64() + client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, value) + metricValue = value + default: + metricValue = fieldValue.Value } - gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, fieldValue.Value}) + gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, metricValue}) } return gpuMetrics, retry diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index f8d3dfb9a..5f01f9f89 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -151,6 +151,17 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { after := time.Now().UnixMicro() assert.Nil(t, err) + asFloat64 := func(metric dcgmMetric) float64 { + require.IsTypef(t, float64(0), metric.value, "Unexpected metric type: %T", metric.value) + value, _ := metric.value.(float64) + return value + } + asInt64 := func(metric dcgmMetric) int64 { + require.IsTypef(t, int64(0), metric.value, "Unexpected metric type: %T", metric.value) + value, _ := metric.value.(int64) + return value + } + seenMetric := make(map[string]bool) assert.GreaterOrEqual(t, len(deviceMetrics), 0) assert.LessOrEqual(t, len(deviceMetrics), 32) @@ -172,8 +183,9 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": fallthrough case "DCGM_FI_PROF_DRAM_ACTIVE": - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(1.0)) case "DCGM_FI_DEV_GPU_UTIL": fallthrough case "DCGM_FI_DEV_MEM_COPY_UTIL": @@ -181,16 +193,18 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_DEV_ENC_UTIL": fallthrough case "DCGM_FI_DEV_DEC_UTIL": - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(100)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100)) case "DCGM_FI_DEV_FB_FREE": fallthrough case "DCGM_FI_DEV_FB_USED": fallthrough case "DCGM_FI_DEV_FB_RESERVED": // arbitrary max of 10 TiB - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(10485760)) case "DCGM_FI_PROF_PCIE_TX_BYTES": fallthrough case "DCGM_FI_PROF_PCIE_RX_BYTES": @@ -199,8 +213,9 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { fallthrough case "DCGM_FI_PROF_NVLINK_RX_BYTES": // arbitrary max of 10 TiB/sec - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(10995116277760)) case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": fallthrough case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": @@ -216,22 +231,26 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": fallthrough case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), time.Now().UnixMicro()) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, time.Now().UnixMicro()) case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": fallthrough case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": // arbitrary max of 100000000 errors - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(100000000)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000000)) case "DCGM_FI_DEV_GPU_TEMP": // arbitrary max of 100000 °C - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0)) + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(100000.0)) case "DCGM_FI_DEV_SM_CLOCK": // arbitrary max of 100000 MHz - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0)) + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(100000.0)) case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": // TODO case "DCGM_FI_DEV_POWER_USAGE": diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 2c3457fd4..1f6de1233 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -19,7 +19,6 @@ package dcgmreceiver import ( "fmt" - "unsafe" "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) @@ -32,20 +31,12 @@ var ( errUnexpectedType = fmt.Errorf("unexpected data type") ) -func (m *dcgmMetric) setFloat64(val float64) { - *(*float64)(unsafe.Pointer(&m.value[0])) = val -} - func (m *dcgmMetric) asFloat64() float64 { - return *(*float64)(unsafe.Pointer(&m.value[0])) -} - -func (m *dcgmMetric) setInt64(val int64) { - *(*int64)(unsafe.Pointer(&m.value[0])) = val + return m.value.(float64) } func (m *dcgmMetric) asInt64() int64 { - return *(*int64)(unsafe.Pointer(&m.value[0])) + return m.value.(int64) } func isValidValue(fieldValue dcgm.FieldValue_v1) error { diff --git a/receiver/dcgmreceiver/util_test.go b/receiver/dcgmreceiver/util_test.go deleted file mode 100644 index daeace14d..000000000 --- a/receiver/dcgmreceiver/util_test.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build gpu -// +build gpu - -package dcgmreceiver - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestDcgmMetricSetFloat64(t *testing.T) { - var metric dcgmMetric - metric.setFloat64(23.0) - require.Equal(t, metric.asFloat64(), 23.0) - metric.setFloat64(43.0) - require.Equal(t, metric.asFloat64(), 43.0) -} - -func TestDcgmMetricSetInt64(t *testing.T) { - var metric dcgmMetric - metric.setInt64(23) - require.Equal(t, metric.asInt64(), int64(23)) - metric.setInt64(43) - require.Equal(t, metric.asInt64(), int64(43)) -}