From f1f132c7c7aa5041901231a3537e968327952ad6 Mon Sep 17 00:00:00 2001 From: igorpeshansky <7594381+igorpeshansky@users.noreply.github.com> Date: Fri, 13 Sep 2024 18:52:31 -0400 Subject: [PATCH] Implement an initial set of uniform DCGM GPU metrics in `dcgmreceiver`. (#219) Co-authored-by: Quentin Smith --- .golangci.yaml | 1 + go.mod | 2 + go.sum | 4 + receiver/dcgmreceiver/client.go | 360 ++++--- receiver/dcgmreceiver/client_gpu_test.go | 331 ++++-- receiver/dcgmreceiver/client_test.go | 4 +- receiver/dcgmreceiver/component_test.go | 13 +- receiver/dcgmreceiver/documentation.md | 164 ++- receiver/dcgmreceiver/factory_gpu_on.go | 28 +- .../dcgmreceiver/generated_package_test.go | 3 +- .../internal/metadata/generated_config.go | 118 ++- .../metadata/generated_config_test.go | 111 +- .../internal/metadata/generated_metrics.go | 986 ++++++++++++++---- .../metadata/generated_metrics_test.go | 328 +++--- .../internal/metadata/generated_resource.go | 50 + .../metadata/generated_resource_test.go | 52 + .../internal/metadata/testdata/config.yaml | 106 +- receiver/dcgmreceiver/metadata.yaml | 177 +++- receiver/dcgmreceiver/scraper.go | 366 +++++-- receiver/dcgmreceiver/scraper_gpu_test.go | 345 ++++-- receiver/dcgmreceiver/scraper_test.go | 8 + .../testdata/NVIDIA_A100-SXM4-40GB.yaml | 40 +- .../testdata/NVIDIA_H100_80GB_HBM3.yaml | 35 + receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml | 38 +- receiver/dcgmreceiver/testdata/Tesla_K80.yaml | 17 - .../testdata/Tesla_P100-PCIE-16GB.yaml | 40 +- receiver/dcgmreceiver/testdata/Tesla_P4.yaml | 40 +- receiver/dcgmreceiver/testdata/Tesla_T4.yaml | 40 +- .../testdata/Tesla_V100-SXM2-16GB.yaml | 40 +- .../testprofilepause/test_profile_pause.go | 54 +- receiver/dcgmreceiver/util.go | 130 ++- receiver/dcgmreceiver/util_test.go | 119 ++- service/components.go | 4 + 33 files changed, 3092 insertions(+), 1062 deletions(-) create mode 100644 receiver/dcgmreceiver/internal/metadata/generated_resource.go create mode 100644 
receiver/dcgmreceiver/internal/metadata/generated_resource_test.go create mode 100644 receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml delete mode 100644 receiver/dcgmreceiver/testdata/Tesla_K80.yaml diff --git a/.golangci.yaml b/.golangci.yaml index 349cc25c1..3c6fe5109 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -57,6 +57,7 @@ linters-settings: - (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf disable: - fieldalignment + - shadow enable-all: true misspell: locale: US diff --git a/go.mod b/go.mod index 7f51e2db6..5087d9a20 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,8 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlecloudexporter v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlemanagedprometheusexporter v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.102.0 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.102.0 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor v0.102.0 diff --git a/go.sum b/go.sum index 1e259b057..838f0e1ac 100644 --- a/go.sum +++ b/go.sum @@ -735,6 +735,10 @@ github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometh github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheusremotewrite v0.102.0/go.mod h1:+Vlutd4t2XluxHYbIAfZiz3z5uWbsbiIUpipV5AnLtk= github.com/open-telemetry/opentelemetry-collector-contrib/pkg/winperfcounters v0.102.0 h1:adfJy3Sev2MaD6+plcmsSecpzy8h4MJT7eXEuif/2Ew= github.com/open-telemetry/opentelemetry-collector-contrib/pkg/winperfcounters v0.102.0/go.mod 
h1:FJmA939yem9GSEbqjCK6CXVbPfNPFKhvKnn+nWNpWio= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.102.0 h1:q4VV17TxeMm0FOeyFXAO4gSRf2ZLtKTh0/l5goxhRsY= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.102.0/go.mod h1:FlP/8TVT768TAh5kpvVX3AQ5/UXJWBuSSCFhO3fE+E0= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0 h1:mj3t9/FAQZjcZJA2kjgbpz2fSK9yD/pYpmqKEWpHJ1A= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0/go.mod h1:IIIjEblgrNISbDY7GPMMto9kEVIf0n9IeJoVru89kfY= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0 h1:DaEYlVCn58GtkyYVK0IT/ZMjRFJ+BfmR0p9I0Eq42aQ= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0/go.mod h1:u9x08rUCWdgI8Nle5XOMTCmxd0K26KTZvMMA5H8Xjyg= github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.102.0 h1:huh7V8uqMakQGdnbOqTSZihfoDeOIbNHfFt62HMsk5k= diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 8f4baef2a..7986d4f5d 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -20,33 +20,47 @@ package dcgmreceiver import ( "errors" "fmt" + "math" "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "go.opentelemetry.io/collector/receiver/scrapererror" "go.uber.org/zap" ) const maxWarningsForFailedDeviceMetricQuery = 5 +const dcgmProfilingFieldsStart = dcgm.Short(1000) + var ErrDcgmInitialization = errors.New("error initializing DCGM") -type dcgmClient struct { - logger *zap.SugaredLogger - handleCleanup func() - enabledFieldIDs []dcgm.Short - enabledFieldGroup dcgm.FieldHandle - deviceIndices []uint - devicesModelName []string - devicesUUID []string - deviceMetricToFailedQueryCount map[string]uint64 +type dcgmClientSettings struct { + endpoint string + pollingInterval 
time.Duration + retryBlankValues bool + maxRetries int + fields []string } -type dcgmMetric struct { - timestamp int64 - gpuIndex uint - name string - value [4096]byte +type deviceMetrics struct { + ModelName string + UUID string + Metrics MetricsMap +} + +type dcgmClient struct { + logger *zap.SugaredLogger + handleCleanup func() + enabledFieldIDs []dcgm.Short + enabledFieldGroup dcgm.FieldHandle + deviceGroup dcgm.GroupHandle + + devices map[uint]deviceMetrics + lastSuccessfulPoll time.Time + + deviceMetricToFailedQueryCount map[string]int + pollingInterval time.Duration + retryBlankValues bool + maxRetries int } // Can't pass argument dcgm.mode because it is unexported @@ -54,39 +68,40 @@ var dcgmInit = func(args ...string) (func(), error) { return dcgm.Init(dcgm.Standalone, args...) } -var dcgmGetLatestValuesForFields = dcgm.GetLatestValuesForFields +var dcgmGetValuesSince = dcgm.GetValuesSince -func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { - dcgmCleanup, err := initializeDcgm(config, logger) +func newClient(settings *dcgmClientSettings, logger *zap.Logger) (*dcgmClient, error) { + dcgmCleanup, err := initializeDcgm(settings.endpoint, logger) if err != nil { return nil, errors.Join(ErrDcgmInitialization, err) } - deviceIndices := make([]uint, 0) - names := make([]string, 0) - UUIDs := make([]string, 0) enabledFieldGroup := dcgm.FieldHandle{} - requestedFieldIDs := discoverRequestedFieldIDs(config) - supportedFieldIDs, err := getAllSupportedFields() + requestedFieldIDs := toFieldIDs(settings.fields) + supportedProfilingFieldIDs, err := getSupportedProfilingFields() if err != nil { // If there is error querying the supported fields at all, let the // receiver collect basic metrics: (GPU utilization, used/free memory). logger.Sugar().Warnf("Error querying supported profiling fields on '%w'. 
GPU profiling metrics will not be collected.", err) } - enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedFieldIDs) + enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs) for _, f := range unavailableFields { - logger.Sugar().Warnf("Field '%s' is not supported. Metric '%s' will not be collected", dcgmIDToName[f], dcgmNameToMetricName[dcgmIDToName[f]]) + logger.Sugar().Warnf("Field '%s' is not supported", dcgmIDToName[f]) } + var deviceGroup dcgm.GroupHandle if len(enabledFields) != 0 { - deviceIndices, names, UUIDs, err = discoverDevices(logger) + supportedDeviceIndices, err := dcgm.GetSupportedDevices() if err != nil { - return nil, err + return nil, fmt.Errorf("Unable to discover supported GPUs on %w", err) } - deviceGroup, err := createDeviceGroup(logger, deviceIndices) + logger.Sugar().Infof("Discovered %d supported GPU devices", len(supportedDeviceIndices)) + + deviceGroup, err = createDeviceGroup(logger, supportedDeviceIndices) if err != nil { return nil, err } - enabledFieldGroup, err = setWatchesOnEnabledFields(config, logger, deviceGroup, enabledFields) + enabledFieldGroup, err = setWatchesOnEnabledFields(settings.pollingInterval, logger, deviceGroup, enabledFields) if err != nil { + _ = dcgm.FieldGroupDestroy(enabledFieldGroup) return nil, fmt.Errorf("Unable to set field watches on %w", err) } } @@ -95,54 +110,47 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { handleCleanup: dcgmCleanup, enabledFieldIDs: enabledFields, enabledFieldGroup: enabledFieldGroup, - deviceIndices: deviceIndices, - devicesModelName: names, - devicesUUID: UUIDs, - deviceMetricToFailedQueryCount: make(map[string]uint64), + deviceGroup: deviceGroup, + devices: map[uint]deviceMetrics{}, + lastSuccessfulPoll: time.Now(), + deviceMetricToFailedQueryCount: make(map[string]int), + pollingInterval: settings.pollingInterval, + retryBlankValues: settings.retryBlankValues, + 
maxRetries: settings.maxRetries, }, nil } // initializeDcgm tries to initialize a DCGM connection; returns a cleanup func // only if the connection is initialized successfully without error -func initializeDcgm(config *Config, logger *zap.Logger) (func(), error) { +func initializeDcgm(endpoint string, logger *zap.Logger) (func(), error) { isSocket := "0" - dcgmCleanup, err := dcgmInit(config.TCPAddrConfig.Endpoint, isSocket) + dcgmCleanup, err := dcgmInit(endpoint, isSocket) if err != nil { - msg := fmt.Sprintf("Unable to connect to DCGM daemon at %s on %v; Is the DCGM daemon running?", config.TCPAddrConfig.Endpoint, err) + msg := fmt.Sprintf("Unable to connect to DCGM daemon at %s on %v; Is the DCGM daemon running?", endpoint, err) logger.Sugar().Warn(msg) if dcgmCleanup != nil { dcgmCleanup() } return nil, fmt.Errorf("%s", msg) } - logger.Sugar().Infof("Connected to DCGM daemon at %s", config.TCPAddrConfig.Endpoint) + logger.Sugar().Infof("Connected to DCGM daemon at %s", endpoint) return dcgmCleanup, nil } -func discoverDevices(logger *zap.Logger) ([]uint, []string, []string, error) { - supportedDeviceIndices, err := dcgm.GetSupportedDevices() +func newDeviceMetrics(logger *zap.SugaredLogger, gpuIndex uint) (deviceMetrics, error) { + deviceInfo, err := dcgm.GetDeviceInfo(gpuIndex) if err != nil { - return nil, nil, nil, fmt.Errorf("Unable to discover supported GPUs on %w", err) + logger.Warnf("Unable to query device info for NVIDIA device %d on '%w'", gpuIndex, err) + return deviceMetrics{}, err } - logger.Sugar().Infof("Discovered %d supported GPU devices", len(supportedDeviceIndices)) - devices := make([]uint, 0, len(supportedDeviceIndices)) - names := make([]string, 0, len(supportedDeviceIndices)) - UUIDs := make([]string, 0, len(supportedDeviceIndices)) - for _, gpuIndex := range supportedDeviceIndices { - deviceInfo, err := dcgm.GetDeviceInfo(gpuIndex) - if err != nil { - logger.Sugar().Warnf("Unable to query device info for NVIDIA device %d on '%w'", 
gpuIndex, err) - continue - } - - devices = append(devices, gpuIndex) - names = append(names, deviceInfo.Identifiers.Model) - UUIDs = append(UUIDs, deviceInfo.UUID) - logger.Sugar().Infof("Discovered NVIDIA device %s with UUID %s", names[gpuIndex], UUIDs[gpuIndex]) + device := deviceMetrics{ + ModelName: deviceInfo.Identifiers.Model, + UUID: deviceInfo.UUID, + Metrics: MetricsMap{}, } - - return devices, names, UUIDs, nil + logger.Infof("Discovered NVIDIA device %s with UUID %s (DCGM GPU ID %d)", device.ModelName, device.UUID, gpuIndex) + return device, nil } func createDeviceGroup(logger *zap.Logger, deviceIndices []uint) (dcgm.GroupHandle, error) { @@ -163,52 +171,18 @@ func createDeviceGroup(logger *zap.Logger, deviceIndices []uint) (dcgm.GroupHand return deviceGroup, nil } -func discoverRequestedFieldIDs(config *Config) []dcgm.Short { - requestedFieldIDs := []dcgm.Short{} - if config.Metrics.DcgmGpuUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_GPU_UTIL"]) - } - if config.Metrics.DcgmGpuMemoryBytesUsed.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_USED"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_FREE"]) +func toFieldIDs(fields []string) []dcgm.Short { + requestedFieldIDs := make([]dcgm.Short, len(fields)) + for i, f := range fields { + requestedFieldIDs[i] = dcgm.DCGM_FI[f] } - if config.Metrics.DcgmGpuProfilingSmUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_SM_ACTIVE"]) - } - if config.Metrics.DcgmGpuProfilingSmOccupancy.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_SM_OCCUPANCY"]) - } - if config.Metrics.DcgmGpuProfilingPipeUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP64_ACTIVE"]) - 
requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP32_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP16_ACTIVE"]) - } - if config.Metrics.DcgmGpuProfilingDramUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_DRAM_ACTIVE"]) - } - if config.Metrics.DcgmGpuProfilingPcieTrafficRate.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_TX_BYTES"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_RX_BYTES"]) - } - if config.Metrics.DcgmGpuProfilingNvlinkTrafficRate.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_TX_BYTES"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_RX_BYTES"]) - } - return requestedFieldIDs } -// getAllSupportedFields calls the DCGM query function to find out all the -// fields that are supported by the current GPUs -func getAllSupportedFields() ([]dcgm.Short, error) { - // Fields like `DCGM_FI_DEV_*` are not profiling fields, and they are always - // supported on all devices - supported := []dcgm.Short{ - dcgm.DCGM_FI["DCGM_FI_DEV_GPU_UTIL"], - dcgm.DCGM_FI["DCGM_FI_DEV_FB_USED"], - dcgm.DCGM_FI["DCGM_FI_DEV_FB_FREE"], - } +// getSupportedProfilingFields calls the DCGM query function to find out all +// profiling fields that are supported by the current GPUs +func getSupportedProfilingFields() ([]dcgm.Short, error) { + supported := []dcgm.Short{} // GetSupportedMetricGroups currently does not support passing the actual // group handle; here we pass 0 to query supported fields for group 0, which // is the default DCGM group that is **supposed** to include all GPUs of the @@ -236,57 +210,82 @@ func getAllSupportedFields() ([]dcgm.Short, error) { } // filterSupportedFields takes the user requested fields and device supported -// fields, and filter to return those that are requested & 
supported to be the -// enabledFields and requested but not supported as unavailableFields -func filterSupportedFields(requestedFields []dcgm.Short, supportedFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { +// profiling fields, and filters to return those that are requested & supported +// to be the enabledFields and requested but not supported as unavailableFields +func filterSupportedFields(requestedFields []dcgm.Short, supportedProfilingFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { var enabledFields []dcgm.Short var unavailableFields []dcgm.Short for _, ef := range requestedFields { - support := false - for _, sf := range supportedFields { + // For fields like `DCGM_FI_DEV_*`, which are not + // profiling fields, assume they are always present. + support := ef < dcgmProfilingFieldsStart + for _, sf := range supportedProfilingFields { if sf == ef { - enabledFields = append(enabledFields, ef) support = true break } } - if !support { + if support { + enabledFields = append(enabledFields, ef) + } else { unavailableFields = append(unavailableFields, ef) } } return enabledFields, unavailableFields } -func setWatchesOnEnabledFields(config *Config, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { +// Internal-only +type dcgmWatchParams struct { + fieldGroupName string + updateFreqUs int64 + maxKeepTime float64 + maxKeepSamples int32 +} + +// Internal-only +func setWatchesOnFields(logger *zap.Logger, deviceGroup dcgm.GroupHandle, fieldIDs []dcgm.Short, params dcgmWatchParams) (dcgm.FieldHandle, error) { var err error - // Note: Add random suffix to avoid conflict amongnst any parallel collectors - fieldGroupName := fmt.Sprintf("google-cloud-ops-agent-metrics-%d", randSource.Intn(10000)) - enabledFieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, enabledFieldIDs) + fieldGroup, err := dcgm.FieldGroupCreate(params.fieldGroupName, fieldIDs) if err != nil { - return dcgm.FieldHandle{}, 
fmt.Errorf("Unable to create DCGM field group '%s'", fieldGroupName) + return dcgm.FieldHandle{}, fmt.Errorf("Unable to create DCGM field group '%s'", params.fieldGroupName) } - msg := fmt.Sprintf("Created DCGM field group '%s' with field ids: ", fieldGroupName) - for _, fieldID := range enabledFieldIDs { + msg := fmt.Sprintf("Created DCGM field group '%s' with field ids: ", params.fieldGroupName) + for _, fieldID := range fieldIDs { msg += fmt.Sprintf("%d ", fieldID) } logger.Sugar().Info(msg) // Note: DCGM retained samples = Max(maxKeepSamples, maxKeepTime/updateFreq) - dcgmUpdateFreq := int64(config.CollectionInterval / time.Microsecond) - dcgmMaxKeepTime := 600.0 /* 10 min */ - dcgmMaxKeepSamples := int32(15) - err = dcgm.WatchFieldsWithGroupEx(enabledFieldGroup, deviceGroup, dcgmUpdateFreq, dcgmMaxKeepTime, dcgmMaxKeepSamples) + dcgmUpdateFreq := params.updateFreqUs + dcgmMaxKeepTime := params.maxKeepTime + dcgmMaxKeepSamples := params.maxKeepSamples + err = dcgm.WatchFieldsWithGroupEx(fieldGroup, deviceGroup, dcgmUpdateFreq, dcgmMaxKeepTime, dcgmMaxKeepSamples) if err != nil { - return dcgm.FieldHandle{}, fmt.Errorf("Setting watches for DCGM field group '%s' failed on %w", fieldGroupName, err) + return fieldGroup, fmt.Errorf("Setting watches for DCGM field group '%s' failed on %w", params.fieldGroupName, err) } - logger.Sugar().Infof("Setting watches for DCGM field group '%s' succeeded", fieldGroupName) + logger.Sugar().Infof("Setting watches for DCGM field group '%s' succeeded", params.fieldGroupName) - return enabledFieldGroup, nil + return fieldGroup, nil +} + +const maxKeepSamples = 100 // TODO: Is this enough? 
+ +func setWatchesOnEnabledFields(pollingInterval time.Duration, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { + return setWatchesOnFields(logger, deviceGroup, enabledFieldIDs, dcgmWatchParams{ + // Note: Add random suffix to avoid conflict amongnst any parallel collectors + fieldGroupName: fmt.Sprintf("google-cloud-ops-agent-metrics-%d", randSource.Intn(10000)), + // Note: DCGM retained samples = Max(maxKeepSamples, maxKeepTime/updateFreq) + updateFreqUs: int64(pollingInterval / time.Microsecond), + maxKeepTime: 600.0, /* 10 min */ + maxKeepSamples: maxKeepSamples, + }) } func (client *dcgmClient) cleanup() { + _ = dcgm.FieldGroupDestroy(client.enabledFieldGroup) + _ = dcgm.DestroyGroup(client.deviceGroup) if client.handleCleanup != nil { client.handleCleanup() } @@ -294,62 +293,91 @@ func (client *dcgmClient) cleanup() { client.logger.Info("Shutdown DCGM") } -func (client *dcgmClient) getDeviceModelName(gpuIndex uint) string { - return client.devicesModelName[gpuIndex] -} - -func (client *dcgmClient) getDeviceUUID(gpuIndex uint) string { - return client.devicesUUID[gpuIndex] -} - -func (client *dcgmClient) collectDeviceMetrics() ([]dcgmMetric, error) { - var err scrapererror.ScrapeErrors - gpuMetrics := make([]dcgmMetric, 0, len(client.enabledFieldIDs)*len(client.deviceIndices)) - for _, gpuIndex := range client.deviceIndices { - fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) - if pollErr == nil { - gpuMetrics = client.appendMetric(gpuMetrics, gpuIndex, fieldValues) - client.logger.Debugf("Successful poll of DCGM daemon for GPU %d", gpuIndex) - } else { - msg := fmt.Sprintf("Unable to poll DCGM daemon for GPU %d on %s", gpuIndex, pollErr) - client.issueWarningForFailedQueryUptoThreshold(gpuIndex, "all-profiling-metrics", msg) - err.AddPartial(1, fmt.Errorf("%s", msg)) - } +// collect will poll dcgm for any new metrics, updating client.devices as appropriate +// 
It returns the estimated polling interval. +func (client *dcgmClient) collect() (time.Duration, error) { + client.logger.Debugf("Polling DCGM daemon for field values") + if len(client.enabledFieldIDs) == 0 { + // Make sure we don't try to scrape without a device group (since we don't construct one when there are no enabled fields). + return 0, nil } - - return gpuMetrics, err.Combine() -} - -func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) []dcgmMetric { + fieldValues, pollTime, err := dcgmGetValuesSince(client.deviceGroup, client.enabledFieldGroup, client.lastSuccessfulPoll) + if err != nil { + msg := fmt.Sprintf("Unable to poll DCGM daemon for metrics: %s", err) + client.issueWarningForFailedQueryUptoThreshold("all-profiling-metrics", maxWarningsForFailedDeviceMetricQuery, msg) + return 0, err + } + client.logger.Debugf("Got %d field values over %s", len(fieldValues), pollTime.Sub(client.lastSuccessfulPoll)) + client.lastSuccessfulPoll = pollTime + oldestTs := int64(math.MaxInt64) + newestTs := int64(0) for _, fieldValue := range fieldValues { - metricName := dcgmNameToMetricName[dcgmIDToName[dcgm.Short(fieldValue.FieldId)]] - if !isValidValue(fieldValue) { - msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s", fieldValue.Ts, gpuIndex, metricName) - client.issueWarningForFailedQueryUptoThreshold(gpuIndex, metricName, msg) + if fieldValue.EntityGroupId != dcgm.FE_GPU { continue } - - switch fieldValue.FieldType { - case dcgm.DCGM_FT_DOUBLE: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, metricName, fieldValue.Float64()) - case dcgm.DCGM_FT_INT64: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, metricName, fieldValue.Int64()) + gpuIndex := fieldValue.EntityId + if _, ok := client.devices[gpuIndex]; !ok { + device, err := newDeviceMetrics(client.logger, gpuIndex) + if err != nil { + continue + } + 
client.devices[gpuIndex] = device + } + device := client.devices[gpuIndex] + dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] + if err := isValidValue(fieldValue); err == errBlankValue { + // Blank values are expected at startup. + continue + } else if err == errNotSupported { + client.issueWarningForFailedQueryUptoThreshold(dcgmName, 1, fmt.Sprintf("Field '%s' is not supported", dcgmName)) + continue + } else if err != nil { + msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) + client.issueWarningForFailedQueryUptoThreshold(fmt.Sprintf("device%d.%s", gpuIndex, dcgmName), maxWarningsForFailedDeviceMetricQuery, msg) + continue } - gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, gpuIndex, metricName, fieldValue.Value}) + if fieldValue.Ts < oldestTs { + oldestTs = fieldValue.Ts + } + if fieldValue.Ts > newestTs { + newestTs = fieldValue.Ts + } + if _, ok := device.Metrics[dcgmName]; !ok { + device.Metrics[dcgmName] = &metricStats{} + } + device.Metrics[dcgmName].Update(fieldValue) } + duration := time.Duration(newestTs-oldestTs) * time.Microsecond + client.logger.Debugf("Successful poll of DCGM daemon returned %v of data", duration) + // If we did a partial poll, there should be more room in the buffer. 
+ duration = max(duration, client.pollingInterval*maxKeepSamples) + return duration, nil +} - return gpuMetrics +// getDeviceMetrics returns a deep copy of client.devices +func (client *dcgmClient) getDeviceMetrics() map[uint]deviceMetrics { + out := map[uint]deviceMetrics{} + for gpuIndex, device := range client.devices { + newMetrics := MetricsMap{} + for key, value := range device.Metrics { + newValue := *value + newMetrics[key] = &newValue + } + // device is already a copy here + device.Metrics = newMetrics + out[gpuIndex] = device + } + return out } -func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(deviceIdx uint, metricName string, reason string) { - deviceMetric := fmt.Sprintf("device%d.%s", deviceIdx, metricName) - client.deviceMetricToFailedQueryCount[deviceMetric]++ +func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(dcgmName string, limit int, reason string) { + client.deviceMetricToFailedQueryCount[dcgmName]++ - failedCount := client.deviceMetricToFailedQueryCount[deviceMetric] - if failedCount <= maxWarningsForFailedDeviceMetricQuery { - client.logger.Warnf("Unable to query '%s' for Nvidia device %d on '%s'", metricName, deviceIdx, reason) - if failedCount == maxWarningsForFailedDeviceMetricQuery { - client.logger.Warnf("Surpressing further device query warnings for '%s' for Nvidia device %d", metricName, deviceIdx) + failedCount := client.deviceMetricToFailedQueryCount[dcgmName] + if failedCount <= limit { + client.logger.Warn(reason) + if limit > 1 && failedCount == limit { + client.logger.Warnf("Surpressing further device query warnings for '%s'", dcgmName) } } } diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 76ad622a7..24d79bec3 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -24,6 +24,8 @@ import ( "io/ioutil" "os" "path" + "slices" + "sort" "strings" "testing" "time" @@ -48,67 +50,67 @@ type 
modelSupportedFields struct { UnsupportedFields []string `yaml:"unsupported_fields"` } -// TestSupportedFieldsWithGolden test getAllSupportedFields() against the golden -// files for the current GPU model -func TestSupportedFieldsWithGolden(t *testing.T) { - config := createDefaultConfig().(*Config) - client, err := newClient(config, zaptest.NewLogger(t)) +func defaultClientSettings() *dcgmClientSettings { + requestedFields := discoverRequestedFields(createDefaultConfig().(*Config)) + return &dcgmClientSettings{ + endpoint: defaultEndpoint, + pollingInterval: 1 * time.Second, + retryBlankValues: true, + maxRetries: 5, + fields: requestedFields, + } +} + +// TestSupportedProfilingFieldsWithGolden tests getSupportedRegularFields() and +// getSupportedProfilingFields() against the golden files for the current GPU +// model +func TestSupportedProfilingFieldsWithGolden(t *testing.T) { + clientSettings := defaultClientSettings() + client, err := newClient(clientSettings, zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. 
Install and run DCGM before running tests.") + defer client.cleanup() - assert.NotEmpty(t, client.devicesModelName) - gpuModel := client.getDeviceModelName(0) - allFields := discoverRequestedFieldIDs(config) - supportedFields, err := getAllSupportedFields() + allFields := toFieldIDs(clientSettings.fields) + supportedProfilingFields, err := getSupportedProfilingFields() require.Nil(t, err) - enabledFields, unavailableFields := filterSupportedFields(allFields, supportedFields) + enabledFields, unavailableFields := filterSupportedFields(allFields, supportedProfilingFields) - dcgmIDToNameMap := make(map[dcgm.Short]string, len(dcgm.DCGM_FI)) - for fieldName, fieldID := range dcgm.DCGM_FI { - dcgmIDToNameMap[fieldID] = fieldName - } var enabledFieldsString []string var unavailableFieldsString []string for _, f := range enabledFields { - enabledFieldsString = append(enabledFieldsString, dcgmIDToNameMap[f]) + name := dcgmIDToName[f] + if !strings.HasPrefix(name, "DCGM_FI_DEV_") { + enabledFieldsString = append(enabledFieldsString, name) + } } for _, f := range unavailableFields { - unavailableFieldsString = append(unavailableFieldsString, dcgmIDToNameMap[f]) - } - m := modelSupportedFields{ - Model: gpuModel, - SupportedFields: enabledFieldsString, - UnsupportedFields: unavailableFieldsString, - } - actual, err := yaml.Marshal(&m) - if err != nil { - t.Fatal(err) + name := dcgmIDToName[f] + if !strings.HasPrefix(name, "DCGM_FI_DEV_") { + unavailableFieldsString = append(unavailableFieldsString, name) + } } - assert.Equal(t, len(dcgmNameToMetricName), len(client.enabledFieldIDs)+len(unavailableFieldsString)) - goldenPath := getModelGoldenFilePath(t, gpuModel) - golden.Assert(t, string(actual), goldenPath) - client.cleanup() + sort.Strings(enabledFieldsString) + sort.Strings(unavailableFieldsString) + _, err = client.collect() + require.Nil(t, err) + require.NotEmpty(t, client.devices) + gpuModel := client.devices[0].ModelName + + want := LoadExpectedMetrics(t, gpuModel) + 
want.SupportedFields = slices.DeleteFunc(want.SupportedFields, func(name string) bool { + return strings.HasPrefix(name, "DCGM_FI_DEV_") + }) + want.UnsupportedFields = slices.DeleteFunc(want.UnsupportedFields, func(name string) bool { + return strings.HasPrefix(name, "DCGM_FI_DEV_") + }) + assert.ElementsMatch(t, enabledFieldsString, want.SupportedFields, "supported profiling fields") + assert.ElementsMatch(t, unavailableFieldsString, want.UnsupportedFields) } // LoadExpectedMetrics read the supported metrics of a GPU model from the golden // file, given a GPU model string -func LoadExpectedMetrics(t *testing.T, model string) []string { +func LoadExpectedMetrics(t *testing.T, model string) modelSupportedFields { t.Helper() - dcgmNameToMetricNameMap := map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": "dcgm.gpu.utilization", - "DCGM_FI_DEV_FB_USED": "dcgm.gpu.memory.bytes_used", - "DCGM_FI_DEV_FB_FREE": "dcgm.gpu.memory.bytes_free", - "DCGM_FI_PROF_SM_ACTIVE": "dcgm.gpu.profiling.sm_utilization", - "DCGM_FI_PROF_SM_OCCUPANCY": "dcgm.gpu.profiling.sm_occupancy", - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "dcgm.gpu.profiling.tensor_utilization", - "DCGM_FI_PROF_DRAM_ACTIVE": "dcgm.gpu.profiling.dram_utilization", - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "dcgm.gpu.profiling.fp64_utilization", - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "dcgm.gpu.profiling.fp32_utilization", - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "dcgm.gpu.profiling.fp16_utilization", - "DCGM_FI_PROF_PCIE_TX_BYTES": "dcgm.gpu.profiling.pcie_sent_bytes", - "DCGM_FI_PROF_PCIE_RX_BYTES": "dcgm.gpu.profiling.pcie_received_bytes", - "DCGM_FI_PROF_NVLINK_TX_BYTES": "dcgm.gpu.profiling.nvlink_sent_bytes", - "DCGM_FI_PROF_NVLINK_RX_BYTES": "dcgm.gpu.profiling.nvlink_received_bytes", - } goldenPath := getModelGoldenFilePath(t, model) goldenFile, err := ioutil.ReadFile(goldenPath) if err != nil { @@ -119,11 +121,7 @@ func LoadExpectedMetrics(t *testing.T, model string) []string { if err != nil { t.Fatal(err) } - var expectedMetrics []string 
- for _, supported := range m.SupportedFields { - expectedMetrics = append(expectedMetrics, dcgmNameToMetricNameMap[supported]) - } - return expectedMetrics + return m } // getModelGoldenFilePath returns golden file path given a GPU model string @@ -137,83 +135,190 @@ func getModelGoldenFilePath(t *testing.T, model string) string { } func TestNewDcgmClientWithGpuPresent(t *testing.T) { - client, err := newClient(createDefaultConfig().(*Config), zaptest.NewLogger(t)) + client, err := newClient(defaultClientSettings(), zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. Install and run DCGM before running tests.") assert.NotNil(t, client) assert.NotNil(t, client.handleCleanup) - assert.Greater(t, len(client.deviceIndices), 0) - for gpuIndex := range client.deviceIndices { - assert.Greater(t, len(client.devicesModelName[gpuIndex]), 0) - assert.Greater(t, len(client.devicesUUID[gpuIndex]), 0) - } client.cleanup() } func TestCollectGpuProfilingMetrics(t *testing.T) { - client, err := newClient(createDefaultConfig().(*Config), zaptest.NewLogger(t)) + clientSettings := defaultClientSettings() + client, err := newClient(clientSettings, zaptest.NewLogger(t)) + defer client.cleanup() require.Nil(t, err, "cannot initialize DCGM. 
Install and run DCGM before running tests.") - expectedMetrics := LoadExpectedMetrics(t, client.devicesModelName[0]) var maxCollectionInterval = 60 * time.Second - before := time.Now().UnixMicro() - maxCollectionInterval.Microseconds() - metrics, err := client.collectDeviceMetrics() - after := time.Now().UnixMicro() - assert.Nil(t, err) - - seenMetric := make(map[string]bool) - for _, metric := range metrics { - assert.GreaterOrEqual(t, metric.gpuIndex, uint(0)) - assert.LessOrEqual(t, metric.gpuIndex, uint(32)) - - switch metric.name { - case "dcgm.gpu.profiling.tensor_utilization": - fallthrough - case "dcgm.gpu.profiling.dram_utilization": - fallthrough - case "dcgm.gpu.profiling.fp64_utilization": - fallthrough - case "dcgm.gpu.profiling.fp32_utilization": - fallthrough - case "dcgm.gpu.profiling.fp16_utilization": - fallthrough - case "dcgm.gpu.profiling.sm_occupancy": - fallthrough - case "dcgm.gpu.profiling.sm_utilization": - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) - case "dcgm.gpu.utilization": - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(100)) - case "dcgm.gpu.memory.bytes_free": - fallthrough - case "dcgm.gpu.memory.bytes_used": - // arbitrary max of 10 TiB - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) - case "dcgm.gpu.profiling.pcie_sent_bytes": - fallthrough - case "dcgm.gpu.profiling.pcie_received_bytes": - fallthrough - case "dcgm.gpu.profiling.nvlink_sent_bytes": - fallthrough - case "dcgm.gpu.profiling.nvlink_received_bytes": - // arbitrary max of 10 TiB/sec - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) - default: - t.Errorf("Unexpected metric '%s'", metric.name) + var before, after int64 + for { + before = time.Now().UnixMicro() - maxCollectionInterval.Microseconds() + 
duration, err := client.collect() + after = time.Now().UnixMicro() + assert.Greater(t, duration, time.Duration(0)) + assert.Nil(t, err) + var metricCount int + for _, device := range client.devices { + for _, metric := range device.Metrics { + if metric.lastFieldValue != nil { + metricCount++ + } + } + } + if metricCount > 0 { + break } + time.Sleep(client.pollingInterval) + } + deviceMetrics := client.devices + + lastFloat64 := func(metric *metricStats) float64 { + assert.Equal(t, dcgm.DCGM_FT_DOUBLE, metric.lastFieldValue.FieldType, "Unexpected metric type: %+v", metric.lastFieldValue) + value, ok := asFloat64(*metric.lastFieldValue) + require.True(t, ok, "Unexpected metric type: %+v", metric.lastFieldValue) + return value + } + lastInt64 := func(metric *metricStats) int64 { + assert.Equal(t, dcgm.DCGM_FT_INT64, metric.lastFieldValue.FieldType, "Unexpected metric type: %+v", metric.lastFieldValue) + value, ok := asInt64(*metric.lastFieldValue) + require.True(t, ok, "Unexpected metric type: %+v", metric.lastFieldValue) + return value + } + + seenMetric := make(map[string]int) + assert.GreaterOrEqual(t, len(deviceMetrics), 0) + assert.LessOrEqual(t, len(deviceMetrics), 32) + for _, device := range deviceMetrics { + for name, metric := range device.Metrics { + switch name { + case "DCGM_FI_PROF_GR_ENGINE_ACTIVE": + fallthrough + case "DCGM_FI_PROF_SM_ACTIVE": + fallthrough + case "DCGM_FI_PROF_SM_OCCUPANCY": + fallthrough + case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": + fallthrough + case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": + fallthrough + case "DCGM_FI_PROF_PIPE_FP32_ACTIVE": + fallthrough + case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": + fallthrough + case "DCGM_FI_PROF_DRAM_ACTIVE": + value := lastFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(1.0)) + case "DCGM_FI_DEV_GPU_UTIL": + fallthrough + case "DCGM_FI_DEV_MEM_COPY_UTIL": + fallthrough + case "DCGM_FI_DEV_ENC_UTIL": + fallthrough + case "DCGM_FI_DEV_DEC_UTIL": + 
value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100)) + case "DCGM_FI_DEV_FB_FREE": + fallthrough + case "DCGM_FI_DEV_FB_USED": + fallthrough + case "DCGM_FI_DEV_FB_RESERVED": + // arbitrary max of 10 TiB + value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(10485760)) + case "DCGM_FI_PROF_PCIE_TX_BYTES": + fallthrough + case "DCGM_FI_PROF_PCIE_RX_BYTES": + fallthrough + case "DCGM_FI_PROF_NVLINK_TX_BYTES": + fallthrough + case "DCGM_FI_PROF_NVLINK_RX_BYTES": + // arbitrary max of 10 TiB/sec + value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(10995116277760)) + case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": + fallthrough + case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": + fallthrough + case "DCGM_FI_DEV_POWER_VIOLATION": + fallthrough + case "DCGM_FI_DEV_RELIABILITY_VIOLATION": + fallthrough + case "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": + fallthrough + case "DCGM_FI_DEV_THERMAL_VIOLATION": + fallthrough + case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": + fallthrough + case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": + value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, time.Now().UnixNano(), name) + case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": + fallthrough + case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": + // arbitrary max of 100000000 errors + value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000000)) + case "DCGM_FI_DEV_GPU_TEMP": + // arbitrary max of 100000 °C + value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000)) + case "DCGM_FI_DEV_SM_CLOCK": + // arbitrary max of 100000 MHz + value := lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000)) + case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": + value := 
lastInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + // TODO + case "DCGM_FI_DEV_POWER_USAGE": + value := lastFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + // TODO + default: + t.Errorf("Unexpected metric '%s'", name) + } + + assert.GreaterOrEqual(t, metric.lastFieldValue.Ts, before) + assert.LessOrEqual(t, metric.lastFieldValue.Ts, after) - assert.GreaterOrEqual(t, metric.timestamp, before) - assert.LessOrEqual(t, metric.timestamp, after) + seenMetric[name]++ + } + } - seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", metric.gpuIndex, metric.name)] = true + for name, count := range seenMetric { + assert.Equalf(t, count, len(deviceMetrics), "metric %q found on an unexpected number of GPUs", name) } - for _, gpuIndex := range client.deviceIndices { - for _, metric := range expectedMetrics { - assert.Equal(t, seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", gpuIndex, metric)], true) + allFields := clientSettings.fields + + var enabledFieldsString []string + var unavailableFieldsString []string + for _, f := range allFields { + if seenMetric[f] > 0 { + enabledFieldsString = append(enabledFieldsString, f) + } else { + unavailableFieldsString = append(unavailableFieldsString, f) } } - client.cleanup() + sort.Strings(enabledFieldsString) + sort.Strings(unavailableFieldsString) + gpuModel := client.devices[0].ModelName + m := modelSupportedFields{ + Model: gpuModel, + SupportedFields: enabledFieldsString, + UnsupportedFields: unavailableFieldsString, + } + actual, err := yaml.Marshal(&m) + if err != nil { + t.Fatal(err) + } + goldenPath := getModelGoldenFilePath(t, gpuModel) + golden.Assert(t, string(actual), goldenPath) } diff --git a/receiver/dcgmreceiver/client_test.go b/receiver/dcgmreceiver/client_test.go index 6d390aa60..010929a09 100644 --- a/receiver/dcgmreceiver/client_test.go +++ b/receiver/dcgmreceiver/client_test.go @@ -32,7 +32,7 @@ import ( func TestNewDcgmClientOnInitializationError(t *testing.T) { realDcgmInit := dcgmInit defer func() 
{ dcgmInit = realDcgmInit }() - dcgmInit = func(args ...string) (func(), error) { + dcgmInit = func(...string) (func(), error) { return nil, fmt.Errorf("No DCGM client library *OR* No DCGM connection") } @@ -44,7 +44,7 @@ func TestNewDcgmClientOnInitializationError(t *testing.T) { return nil }))) - client, err := newClient(createDefaultConfig().(*Config), logger) + client, err := newClient(&dcgmClientSettings{endpoint: defaultEndpoint}, logger) assert.Equal(t, seenDcgmConnectionWarning, true) assert.True(t, errors.Is(err, ErrDcgmInitialization)) assert.Regexp(t, ".*Unable to connect.*", err) diff --git a/receiver/dcgmreceiver/component_test.go b/receiver/dcgmreceiver/component_test.go index 18ad1e214..795ef9e5f 100644 --- a/receiver/dcgmreceiver/component_test.go +++ b/receiver/dcgmreceiver/component_test.go @@ -35,6 +35,7 @@ import ( "go.opentelemetry.io/collector/consumer/consumertest" "go.opentelemetry.io/collector/receiver" "go.opentelemetry.io/collector/receiver/receivertest" + "go.uber.org/zap/zaptest" ) func TestComponentFactoryType(t *testing.T) { @@ -45,6 +46,12 @@ func TestComponentConfigStruct(t *testing.T) { require.NoError(t, componenttest.CheckConfigStruct(NewFactory().CreateDefaultConfig())) } +func newCreateSettings(t *testing.T) receiver.CreateSettings { + settings := receivertest.NewNopCreateSettings() + settings.Logger = zaptest.NewLogger(t) + return settings +} + func TestComponentLifecycle(t *testing.T) { factory := NewFactory() @@ -70,19 +77,19 @@ func TestComponentLifecycle(t *testing.T) { for _, test := range tests { t.Run(test.name+"-shutdown", func(t *testing.T) { - c, err := test.createFn(context.Background(), receivertest.NewNopCreateSettings(), cfg) + c, err := test.createFn(context.Background(), newCreateSettings(t), cfg) require.NoError(t, err) err = c.Shutdown(context.Background()) require.NoError(t, err) }) t.Run(test.name+"-lifecycle", func(t *testing.T) { - firstRcvr, err := test.createFn(context.Background(), 
receivertest.NewNopCreateSettings(), cfg) + firstRcvr, err := test.createFn(context.Background(), newCreateSettings(t), cfg) require.NoError(t, err) host := componenttest.NewNopHost() require.NoError(t, err) require.NoError(t, firstRcvr.Start(context.Background(), host)) require.NoError(t, firstRcvr.Shutdown(context.Background())) - secondRcvr, err := test.createFn(context.Background(), receivertest.NewNopCreateSettings(), cfg) + secondRcvr, err := test.createFn(context.Background(), newCreateSettings(t), cfg) require.NoError(t, err) require.NoError(t, secondRcvr.Start(context.Background(), host)) require.NoError(t, secondRcvr.Shutdown(context.Background())) diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 5151ea5fc..122ded718 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -12,93 +12,119 @@ metrics: enabled: false ``` -### dcgm.gpu.memory.bytes_used +### gpu.dcgm.clock.frequency -Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. +Multiprocessor clock frequency. | Unit | Metric Type | Value Type | | ---- | ----------- | ---------- | -| By | Gauge | Int | +| Hz | Gauge | Double | + +### gpu.dcgm.clock.throttle_duration.time + +Clock throttle total duration. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| s | Sum | Double | Cumulative | true | #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | -| memory_state | GPU memory used or free | Str: ``used``, ``free`` | +| gpu.clock.violation | Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. 
| Str: ``power``, ``thermal``, ``sync_boost``, ``board_limit``, ``low_util``, ``reliability``, ``app_clock``, ``base_clock`` | -### dcgm.gpu.profiling.dram_utilization +### gpu.dcgm.codec.decoder.utilization -Fraction of cycles data was being sent or received from GPU memory. +Decoder utilization. | Unit | Metric Type | Value Type | | ---- | ----------- | ---------- | | 1 | Gauge | Double | +### gpu.dcgm.codec.encoder.utilization + +Encoder utilization. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.ecc_errors + +Data corruption errors. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| 1 | Sum | Int | Cumulative | true | + #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | +| gpu.error.type | The type of error, one of [sbe, dbe]. | Str: ``sbe``, ``dbe`` | -### dcgm.gpu.profiling.nvlink_traffic_rate +### gpu.dcgm.energy_consumption -The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers. +Total energy consumption for the GPU in J since the driver was last reloaded. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| J | Sum | Double | Cumulative | true | + +### gpu.dcgm.memory.bandwidth_utilization + +Fraction of cycles data was being sent or received from GPU memory. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.memory.bytes_used + +Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. 
| Unit | Metric Type | Value Type | | ---- | ----------- | ---------- | -| By/s | Gauge | Int | +| By | Gauge | Int | #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | -| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | +| gpu.memory.state | GPU memory state, one of [free, used, reserved]. | Str: ``used``, ``free``, ``reserved`` | -### dcgm.gpu.profiling.pcie_traffic_rate +### gpu.dcgm.nvlink.io -The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads. +The number of bytes sent over NVLink, not including protocol headers. -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| By/s | Gauge | Int | +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| By | Sum | Int | Cumulative | true | #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | -| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | +| network.io.direction | Direction of the link traffic, one of [transmit, receive]. | Str: ``transmit``, ``receive`` | -### dcgm.gpu.profiling.pipe_utilization +### gpu.dcgm.pcie.io -Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. +The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. 
-| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| By | Sum | Int | Cumulative | true | #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | -| pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | +| network.io.direction | Direction of the link traffic, one of [transmit, receive]. | Str: ``transmit``, ``receive`` | -### dcgm.gpu.profiling.sm_occupancy +### gpu.dcgm.pipe.utilization -Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors. +Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. | Unit | Metric Type | Value Type | | ---- | ----------- | ---------- | @@ -108,11 +134,9 @@ Fraction of resident warps on a multiprocessor relative to the maximum number su | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | +| gpu.pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | -### dcgm.gpu.profiling.sm_utilization +### gpu.dcgm.sm.utilization Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. 
@@ -120,26 +144,58 @@ Fraction of time at least one warp was active on a multiprocessor, averaged over | ---- | ----------- | ---------- | | 1 | Gauge | Double | -#### Attributes +### gpu.dcgm.temperature -| Name | Description | Values | -| ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | +Current temperature readings for the device, in °C. -### dcgm.gpu.utilization +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| Cel | Gauge | Double | -Fraction of time the GPU was not idle. +### gpu.dcgm.utilization + +Ratio of time the graphics engine is active. | Unit | Metric Type | Value Type | | ---- | ----------- | ---------- | | 1 | Gauge | Double | +## Optional Metrics + +The following metrics are not emitted by default. Each of them can be enabled by applying the following configuration: + +```yaml +metrics: + <metric_name>: + enabled: true +``` + +### gpu.dcgm.sm.occupancy + +Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.xid_errors + +XID errors. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| 1 | Sum | Int | Cumulative | true | + #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | +| gpu.error.xid | The XID code for the error, 1..143. | Any Int | + +## Resource Attributes + +| Name | Description | Values | Enabled | +| ---- | ----------- | ------ | ------- | +| gpu.model | GPU model name. | Any Str | true | +| gpu.number | GPU index starting at 0. 
| Any Str | true | +| gpu.uuid | GPU universally unique identifier. | Any Str | true | diff --git a/receiver/dcgmreceiver/factory_gpu_on.go b/receiver/dcgmreceiver/factory_gpu_on.go index 08295aa12..9601940d0 100644 --- a/receiver/dcgmreceiver/factory_gpu_on.go +++ b/receiver/dcgmreceiver/factory_gpu_on.go @@ -21,6 +21,7 @@ import ( "context" "fmt" "math/rand" + "strings" "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" @@ -33,37 +34,16 @@ import ( ) var dcgmIDToName map[dcgm.Short]string -var dcgmNameToMetricName map[string]string -var metricNameToDcgmName map[string]string var randSource = rand.New(rand.NewSource(time.Now().UnixMicro())) func init() { dcgmIDToName = make(map[dcgm.Short]string, len(dcgm.DCGM_FI)) for fieldName, fieldID := range dcgm.DCGM_FI { + if strings.HasPrefix(fieldName, "DCGM_FT_") { + continue + } dcgmIDToName[fieldID] = fieldName } - - dcgmNameToMetricName = map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": "dcgm.gpu.utilization", - "DCGM_FI_DEV_FB_USED": "dcgm.gpu.memory.bytes_used", - "DCGM_FI_DEV_FB_FREE": "dcgm.gpu.memory.bytes_free", - "DCGM_FI_PROF_SM_ACTIVE": "dcgm.gpu.profiling.sm_utilization", - "DCGM_FI_PROF_SM_OCCUPANCY": "dcgm.gpu.profiling.sm_occupancy", - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "dcgm.gpu.profiling.tensor_utilization", - "DCGM_FI_PROF_DRAM_ACTIVE": "dcgm.gpu.profiling.dram_utilization", - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "dcgm.gpu.profiling.fp64_utilization", - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "dcgm.gpu.profiling.fp32_utilization", - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "dcgm.gpu.profiling.fp16_utilization", - "DCGM_FI_PROF_PCIE_TX_BYTES": "dcgm.gpu.profiling.pcie_sent_bytes", - "DCGM_FI_PROF_PCIE_RX_BYTES": "dcgm.gpu.profiling.pcie_received_bytes", - "DCGM_FI_PROF_NVLINK_TX_BYTES": "dcgm.gpu.profiling.nvlink_sent_bytes", - "DCGM_FI_PROF_NVLINK_RX_BYTES": "dcgm.gpu.profiling.nvlink_received_bytes", - } - - metricNameToDcgmName = make(map[string]string, len(dcgmNameToMetricName)) - for dcgmName, metricName := range 
dcgmNameToMetricName { - metricNameToDcgmName[metricName] = dcgmName - } } func createMetricsReceiver( diff --git a/receiver/dcgmreceiver/generated_package_test.go b/receiver/dcgmreceiver/generated_package_test.go index 90d299c5c..0da0bc8c9 100644 --- a/receiver/dcgmreceiver/generated_package_test.go +++ b/receiver/dcgmreceiver/generated_package_test.go @@ -3,8 +3,9 @@ package dcgmreceiver import ( - "go.uber.org/goleak" "testing" + + "go.uber.org/goleak" ) func TestMain(m *testing.M) { diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index ec7383f79..bb5070e70 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -4,6 +4,7 @@ package metadata import ( "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/filter" ) // MetricConfig provides common config for a particular metric. @@ -27,40 +28,119 @@ func (ms *MetricConfig) Unmarshal(parser *confmap.Conf) error { // MetricsConfig provides config for dcgm metrics. 
type MetricsConfig struct { - DcgmGpuMemoryBytesUsed MetricConfig `mapstructure:"dcgm.gpu.memory.bytes_used"` - DcgmGpuProfilingDramUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.dram_utilization"` - DcgmGpuProfilingNvlinkTrafficRate MetricConfig `mapstructure:"dcgm.gpu.profiling.nvlink_traffic_rate"` - DcgmGpuProfilingPcieTrafficRate MetricConfig `mapstructure:"dcgm.gpu.profiling.pcie_traffic_rate"` - DcgmGpuProfilingPipeUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.pipe_utilization"` - DcgmGpuProfilingSmOccupancy MetricConfig `mapstructure:"dcgm.gpu.profiling.sm_occupancy"` - DcgmGpuProfilingSmUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.sm_utilization"` - DcgmGpuUtilization MetricConfig `mapstructure:"dcgm.gpu.utilization"` + GpuDcgmClockFrequency MetricConfig `mapstructure:"gpu.dcgm.clock.frequency"` + GpuDcgmClockThrottleDurationTime MetricConfig `mapstructure:"gpu.dcgm.clock.throttle_duration.time"` + GpuDcgmCodecDecoderUtilization MetricConfig `mapstructure:"gpu.dcgm.codec.decoder.utilization"` + GpuDcgmCodecEncoderUtilization MetricConfig `mapstructure:"gpu.dcgm.codec.encoder.utilization"` + GpuDcgmEccErrors MetricConfig `mapstructure:"gpu.dcgm.ecc_errors"` + GpuDcgmEnergyConsumption MetricConfig `mapstructure:"gpu.dcgm.energy_consumption"` + GpuDcgmMemoryBandwidthUtilization MetricConfig `mapstructure:"gpu.dcgm.memory.bandwidth_utilization"` + GpuDcgmMemoryBytesUsed MetricConfig `mapstructure:"gpu.dcgm.memory.bytes_used"` + GpuDcgmNvlinkIo MetricConfig `mapstructure:"gpu.dcgm.nvlink.io"` + GpuDcgmPcieIo MetricConfig `mapstructure:"gpu.dcgm.pcie.io"` + GpuDcgmPipeUtilization MetricConfig `mapstructure:"gpu.dcgm.pipe.utilization"` + GpuDcgmSmOccupancy MetricConfig `mapstructure:"gpu.dcgm.sm.occupancy"` + GpuDcgmSmUtilization MetricConfig `mapstructure:"gpu.dcgm.sm.utilization"` + GpuDcgmTemperature MetricConfig `mapstructure:"gpu.dcgm.temperature"` + GpuDcgmUtilization MetricConfig `mapstructure:"gpu.dcgm.utilization"` 
+ GpuDcgmXidErrors MetricConfig `mapstructure:"gpu.dcgm.xid_errors"` } func DefaultMetricsConfig() MetricsConfig { return MetricsConfig{ - DcgmGpuMemoryBytesUsed: MetricConfig{ + GpuDcgmClockFrequency: MetricConfig{ Enabled: true, }, - DcgmGpuProfilingDramUtilization: MetricConfig{ + GpuDcgmClockThrottleDurationTime: MetricConfig{ Enabled: true, }, - DcgmGpuProfilingNvlinkTrafficRate: MetricConfig{ + GpuDcgmCodecDecoderUtilization: MetricConfig{ Enabled: true, }, - DcgmGpuProfilingPcieTrafficRate: MetricConfig{ + GpuDcgmCodecEncoderUtilization: MetricConfig{ Enabled: true, }, - DcgmGpuProfilingPipeUtilization: MetricConfig{ + GpuDcgmEccErrors: MetricConfig{ Enabled: true, }, - DcgmGpuProfilingSmOccupancy: MetricConfig{ + GpuDcgmEnergyConsumption: MetricConfig{ Enabled: true, }, - DcgmGpuProfilingSmUtilization: MetricConfig{ + GpuDcgmMemoryBandwidthUtilization: MetricConfig{ Enabled: true, }, - DcgmGpuUtilization: MetricConfig{ + GpuDcgmMemoryBytesUsed: MetricConfig{ + Enabled: true, + }, + GpuDcgmNvlinkIo: MetricConfig{ + Enabled: true, + }, + GpuDcgmPcieIo: MetricConfig{ + Enabled: true, + }, + GpuDcgmPipeUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmSmOccupancy: MetricConfig{ + Enabled: false, + }, + GpuDcgmSmUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmTemperature: MetricConfig{ + Enabled: true, + }, + GpuDcgmUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmXidErrors: MetricConfig{ + Enabled: false, + }, + } +} + +// ResourceAttributeConfig provides common config for a particular resource attribute. +type ResourceAttributeConfig struct { + Enabled bool `mapstructure:"enabled"` + // Experimental: MetricsInclude defines a list of filters for attribute values. + // If the list is not empty, only metrics with matching resource attribute values will be emitted. + MetricsInclude []filter.Config `mapstructure:"metrics_include"` + // Experimental: MetricsExclude defines a list of filters for attribute values. 
+ // If the list is not empty, metrics with matching resource attribute values will not be emitted. + // MetricsInclude has higher priority than MetricsExclude. + MetricsExclude []filter.Config `mapstructure:"metrics_exclude"` + + enabledSetByUser bool +} + +func (rac *ResourceAttributeConfig) Unmarshal(parser *confmap.Conf) error { + if parser == nil { + return nil + } + err := parser.Unmarshal(rac) + if err != nil { + return err + } + rac.enabledSetByUser = parser.IsSet("enabled") + return nil +} + +// ResourceAttributesConfig provides config for dcgm resource attributes. +type ResourceAttributesConfig struct { + GpuModel ResourceAttributeConfig `mapstructure:"gpu.model"` + GpuNumber ResourceAttributeConfig `mapstructure:"gpu.number"` + GpuUUID ResourceAttributeConfig `mapstructure:"gpu.uuid"` +} + +func DefaultResourceAttributesConfig() ResourceAttributesConfig { + return ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{ + Enabled: true, + }, + GpuNumber: ResourceAttributeConfig{ + Enabled: true, + }, + GpuUUID: ResourceAttributeConfig{ Enabled: true, }, } @@ -68,11 +148,13 @@ func DefaultMetricsConfig() MetricsConfig { // MetricsBuilderConfig is a configuration for dcgm metrics builder. 
type MetricsBuilderConfig struct { - Metrics MetricsConfig `mapstructure:"metrics"` + Metrics MetricsConfig `mapstructure:"metrics"` + ResourceAttributes ResourceAttributesConfig `mapstructure:"resource_attributes"` } func DefaultMetricsBuilderConfig() MetricsBuilderConfig { return MetricsBuilderConfig{ - Metrics: DefaultMetricsConfig(), + Metrics: DefaultMetricsConfig(), + ResourceAttributes: DefaultResourceAttributesConfig(), } } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go index 61c444bbb..ca1405d53 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go @@ -9,6 +9,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap/confmaptest" ) @@ -25,14 +26,27 @@ func TestMetricsBuilderConfig(t *testing.T) { name: "all_set", want: MetricsBuilderConfig{ Metrics: MetricsConfig{ - DcgmGpuMemoryBytesUsed: MetricConfig{Enabled: true}, - DcgmGpuProfilingDramUtilization: MetricConfig{Enabled: true}, - DcgmGpuProfilingNvlinkTrafficRate: MetricConfig{Enabled: true}, - DcgmGpuProfilingPcieTrafficRate: MetricConfig{Enabled: true}, - DcgmGpuProfilingPipeUtilization: MetricConfig{Enabled: true}, - DcgmGpuProfilingSmOccupancy: MetricConfig{Enabled: true}, - DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: true}, - DcgmGpuUtilization: MetricConfig{Enabled: true}, + GpuDcgmClockFrequency: MetricConfig{Enabled: true}, + GpuDcgmClockThrottleDurationTime: MetricConfig{Enabled: true}, + GpuDcgmCodecDecoderUtilization: MetricConfig{Enabled: true}, + GpuDcgmCodecEncoderUtilization: MetricConfig{Enabled: true}, + GpuDcgmEccErrors: MetricConfig{Enabled: true}, + GpuDcgmEnergyConsumption: MetricConfig{Enabled: true}, + GpuDcgmMemoryBandwidthUtilization: 
MetricConfig{Enabled: true}, + GpuDcgmMemoryBytesUsed: MetricConfig{Enabled: true}, + GpuDcgmNvlinkIo: MetricConfig{Enabled: true}, + GpuDcgmPcieIo: MetricConfig{Enabled: true}, + GpuDcgmPipeUtilization: MetricConfig{Enabled: true}, + GpuDcgmSmOccupancy: MetricConfig{Enabled: true}, + GpuDcgmSmUtilization: MetricConfig{Enabled: true}, + GpuDcgmTemperature: MetricConfig{Enabled: true}, + GpuDcgmUtilization: MetricConfig{Enabled: true}, + GpuDcgmXidErrors: MetricConfig{Enabled: true}, + }, + ResourceAttributes: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: true}, + GpuNumber: ResourceAttributeConfig{Enabled: true}, + GpuUUID: ResourceAttributeConfig{Enabled: true}, }, }, }, @@ -40,14 +54,27 @@ func TestMetricsBuilderConfig(t *testing.T) { name: "none_set", want: MetricsBuilderConfig{ Metrics: MetricsConfig{ - DcgmGpuMemoryBytesUsed: MetricConfig{Enabled: false}, - DcgmGpuProfilingDramUtilization: MetricConfig{Enabled: false}, - DcgmGpuProfilingNvlinkTrafficRate: MetricConfig{Enabled: false}, - DcgmGpuProfilingPcieTrafficRate: MetricConfig{Enabled: false}, - DcgmGpuProfilingPipeUtilization: MetricConfig{Enabled: false}, - DcgmGpuProfilingSmOccupancy: MetricConfig{Enabled: false}, - DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: false}, - DcgmGpuUtilization: MetricConfig{Enabled: false}, + GpuDcgmClockFrequency: MetricConfig{Enabled: false}, + GpuDcgmClockThrottleDurationTime: MetricConfig{Enabled: false}, + GpuDcgmCodecDecoderUtilization: MetricConfig{Enabled: false}, + GpuDcgmCodecEncoderUtilization: MetricConfig{Enabled: false}, + GpuDcgmEccErrors: MetricConfig{Enabled: false}, + GpuDcgmEnergyConsumption: MetricConfig{Enabled: false}, + GpuDcgmMemoryBandwidthUtilization: MetricConfig{Enabled: false}, + GpuDcgmMemoryBytesUsed: MetricConfig{Enabled: false}, + GpuDcgmNvlinkIo: MetricConfig{Enabled: false}, + GpuDcgmPcieIo: MetricConfig{Enabled: false}, + GpuDcgmPipeUtilization: MetricConfig{Enabled: false}, + GpuDcgmSmOccupancy: 
MetricConfig{Enabled: false}, + GpuDcgmSmUtilization: MetricConfig{Enabled: false}, + GpuDcgmTemperature: MetricConfig{Enabled: false}, + GpuDcgmUtilization: MetricConfig{Enabled: false}, + GpuDcgmXidErrors: MetricConfig{Enabled: false}, + }, + ResourceAttributes: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: false}, + GpuNumber: ResourceAttributeConfig{Enabled: false}, + GpuUUID: ResourceAttributeConfig{Enabled: false}, }, }, }, @@ -55,7 +82,7 @@ func TestMetricsBuilderConfig(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { cfg := loadMetricsBuilderConfig(t, tt.name) - if diff := cmp.Diff(tt.want, cfg, cmpopts.IgnoreUnexported(MetricConfig{})); diff != "" { + if diff := cmp.Diff(tt.want, cfg, cmpopts.IgnoreUnexported(MetricConfig{}, ResourceAttributeConfig{})); diff != "" { t.Errorf("Config mismatch (-expected +actual):\n%s", diff) } }) @@ -68,6 +95,54 @@ func loadMetricsBuilderConfig(t *testing.T, name string) MetricsBuilderConfig { sub, err := cm.Sub(name) require.NoError(t, err) cfg := DefaultMetricsBuilderConfig() - require.NoError(t, sub.Unmarshal(&cfg)) + require.NoError(t, component.UnmarshalConfig(sub, &cfg)) + return cfg +} + +func TestResourceAttributesConfig(t *testing.T) { + tests := []struct { + name string + want ResourceAttributesConfig + }{ + { + name: "default", + want: DefaultResourceAttributesConfig(), + }, + { + name: "all_set", + want: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: true}, + GpuNumber: ResourceAttributeConfig{Enabled: true}, + GpuUUID: ResourceAttributeConfig{Enabled: true}, + }, + }, + { + name: "none_set", + want: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: false}, + GpuNumber: ResourceAttributeConfig{Enabled: false}, + GpuUUID: ResourceAttributeConfig{Enabled: false}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := loadResourceAttributesConfig(t, tt.name) + if diff := 
cmp.Diff(tt.want, cfg, cmpopts.IgnoreUnexported(ResourceAttributeConfig{})); diff != "" { + t.Errorf("Config mismatch (-expected +actual):\n%s", diff) + } + }) + } +} + +func loadResourceAttributesConfig(t *testing.T, name string) ResourceAttributesConfig { + cm, err := confmaptest.LoadConf(filepath.Join("testdata", "config.yaml")) + require.NoError(t, err) + sub, err := cm.Sub(name) + require.NoError(t, err) + sub, err = sub.Sub("resource_attributes") + require.NoError(t, err) + cfg := DefaultResourceAttributesConfig() + require.NoError(t, component.UnmarshalConfig(sub, &cfg)) return cfg } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go index 6d681ee43..435157c38 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go @@ -6,135 +6,211 @@ import ( "time" "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/filter" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/receiver" ) -// AttributeDirection specifies the a value direction attribute. -type AttributeDirection int +// AttributeGpuClockViolation specifies the a value gpu.clock.violation attribute. +type AttributeGpuClockViolation int const ( - _ AttributeDirection = iota - AttributeDirectionTx - AttributeDirectionRx + _ AttributeGpuClockViolation = iota + AttributeGpuClockViolationPower + AttributeGpuClockViolationThermal + AttributeGpuClockViolationSyncBoost + AttributeGpuClockViolationBoardLimit + AttributeGpuClockViolationLowUtil + AttributeGpuClockViolationReliability + AttributeGpuClockViolationAppClock + AttributeGpuClockViolationBaseClock ) -// String returns the string representation of the AttributeDirection. -func (av AttributeDirection) String() string { +// String returns the string representation of the AttributeGpuClockViolation. 
+func (av AttributeGpuClockViolation) String() string { switch av { - case AttributeDirectionTx: - return "tx" - case AttributeDirectionRx: - return "rx" + case AttributeGpuClockViolationPower: + return "power" + case AttributeGpuClockViolationThermal: + return "thermal" + case AttributeGpuClockViolationSyncBoost: + return "sync_boost" + case AttributeGpuClockViolationBoardLimit: + return "board_limit" + case AttributeGpuClockViolationLowUtil: + return "low_util" + case AttributeGpuClockViolationReliability: + return "reliability" + case AttributeGpuClockViolationAppClock: + return "app_clock" + case AttributeGpuClockViolationBaseClock: + return "base_clock" } return "" } -// MapAttributeDirection is a helper map of string to AttributeDirection attribute value. -var MapAttributeDirection = map[string]AttributeDirection{ - "tx": AttributeDirectionTx, - "rx": AttributeDirectionRx, +// MapAttributeGpuClockViolation is a helper map of string to AttributeGpuClockViolation attribute value. +var MapAttributeGpuClockViolation = map[string]AttributeGpuClockViolation{ + "power": AttributeGpuClockViolationPower, + "thermal": AttributeGpuClockViolationThermal, + "sync_boost": AttributeGpuClockViolationSyncBoost, + "board_limit": AttributeGpuClockViolationBoardLimit, + "low_util": AttributeGpuClockViolationLowUtil, + "reliability": AttributeGpuClockViolationReliability, + "app_clock": AttributeGpuClockViolationAppClock, + "base_clock": AttributeGpuClockViolationBaseClock, } -// AttributeMemoryState specifies the a value memory_state attribute. -type AttributeMemoryState int +// AttributeGpuErrorType specifies the a value gpu.error.type attribute. +type AttributeGpuErrorType int const ( - _ AttributeMemoryState = iota - AttributeMemoryStateUsed - AttributeMemoryStateFree + _ AttributeGpuErrorType = iota + AttributeGpuErrorTypeSbe + AttributeGpuErrorTypeDbe ) -// String returns the string representation of the AttributeMemoryState. 
-func (av AttributeMemoryState) String() string { +// String returns the string representation of the AttributeGpuErrorType. +func (av AttributeGpuErrorType) String() string { switch av { - case AttributeMemoryStateUsed: + case AttributeGpuErrorTypeSbe: + return "sbe" + case AttributeGpuErrorTypeDbe: + return "dbe" + } + return "" +} + +// MapAttributeGpuErrorType is a helper map of string to AttributeGpuErrorType attribute value. +var MapAttributeGpuErrorType = map[string]AttributeGpuErrorType{ + "sbe": AttributeGpuErrorTypeSbe, + "dbe": AttributeGpuErrorTypeDbe, +} + +// AttributeGpuMemoryState specifies the a value gpu.memory.state attribute. +type AttributeGpuMemoryState int + +const ( + _ AttributeGpuMemoryState = iota + AttributeGpuMemoryStateUsed + AttributeGpuMemoryStateFree + AttributeGpuMemoryStateReserved +) + +// String returns the string representation of the AttributeGpuMemoryState. +func (av AttributeGpuMemoryState) String() string { + switch av { + case AttributeGpuMemoryStateUsed: return "used" - case AttributeMemoryStateFree: + case AttributeGpuMemoryStateFree: return "free" + case AttributeGpuMemoryStateReserved: + return "reserved" } return "" } -// MapAttributeMemoryState is a helper map of string to AttributeMemoryState attribute value. -var MapAttributeMemoryState = map[string]AttributeMemoryState{ - "used": AttributeMemoryStateUsed, - "free": AttributeMemoryStateFree, +// MapAttributeGpuMemoryState is a helper map of string to AttributeGpuMemoryState attribute value. +var MapAttributeGpuMemoryState = map[string]AttributeGpuMemoryState{ + "used": AttributeGpuMemoryStateUsed, + "free": AttributeGpuMemoryStateFree, + "reserved": AttributeGpuMemoryStateReserved, } -// AttributePipe specifies the a value pipe attribute. -type AttributePipe int +// AttributeGpuPipe specifies the a value gpu.pipe attribute. 
+type AttributeGpuPipe int const ( - _ AttributePipe = iota - AttributePipeTensor - AttributePipeFp64 - AttributePipeFp32 - AttributePipeFp16 + _ AttributeGpuPipe = iota + AttributeGpuPipeTensor + AttributeGpuPipeFp64 + AttributeGpuPipeFp32 + AttributeGpuPipeFp16 ) -// String returns the string representation of the AttributePipe. -func (av AttributePipe) String() string { +// String returns the string representation of the AttributeGpuPipe. +func (av AttributeGpuPipe) String() string { switch av { - case AttributePipeTensor: + case AttributeGpuPipeTensor: return "tensor" - case AttributePipeFp64: + case AttributeGpuPipeFp64: return "fp64" - case AttributePipeFp32: + case AttributeGpuPipeFp32: return "fp32" - case AttributePipeFp16: + case AttributeGpuPipeFp16: return "fp16" } return "" } -// MapAttributePipe is a helper map of string to AttributePipe attribute value. -var MapAttributePipe = map[string]AttributePipe{ - "tensor": AttributePipeTensor, - "fp64": AttributePipeFp64, - "fp32": AttributePipeFp32, - "fp16": AttributePipeFp16, +// MapAttributeGpuPipe is a helper map of string to AttributeGpuPipe attribute value. +var MapAttributeGpuPipe = map[string]AttributeGpuPipe{ + "tensor": AttributeGpuPipeTensor, + "fp64": AttributeGpuPipeFp64, + "fp32": AttributeGpuPipeFp32, + "fp16": AttributeGpuPipeFp16, +} + +// AttributeNetworkIoDirection specifies the a value network.io.direction attribute. +type AttributeNetworkIoDirection int + +const ( + _ AttributeNetworkIoDirection = iota + AttributeNetworkIoDirectionTransmit + AttributeNetworkIoDirectionReceive +) + +// String returns the string representation of the AttributeNetworkIoDirection. +func (av AttributeNetworkIoDirection) String() string { + switch av { + case AttributeNetworkIoDirectionTransmit: + return "transmit" + case AttributeNetworkIoDirectionReceive: + return "receive" + } + return "" +} + +// MapAttributeNetworkIoDirection is a helper map of string to AttributeNetworkIoDirection attribute value. 
+var MapAttributeNetworkIoDirection = map[string]AttributeNetworkIoDirection{ + "transmit": AttributeNetworkIoDirectionTransmit, + "receive": AttributeNetworkIoDirectionReceive, } -type metricDcgmGpuMemoryBytesUsed struct { +type metricGpuDcgmClockFrequency struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.memory.bytes_used metric with initial data. -func (m *metricDcgmGpuMemoryBytesUsed) init() { - m.data.SetName("dcgm.gpu.memory.bytes_used") - m.data.SetDescription("Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.") - m.data.SetUnit("By") +// init fills gpu.dcgm.clock.frequency metric with initial data. +func (m *metricGpuDcgmClockFrequency) init() { + m.data.SetName("gpu.dcgm.clock.frequency") + m.data.SetDescription("Multiprocessor clock frequency.") + m.data.SetUnit("Hz") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, memoryStateAttributeValue string) { +func (m *metricGpuDcgmClockFrequency) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } dp := m.data.Gauge().DataPoints().AppendEmpty() dp.SetStartTimestamp(start) dp.SetTimestamp(ts) - dp.SetIntValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) - dp.Attributes().PutStr("memory_state", memoryStateAttributeValue) + dp.SetDoubleValue(val) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
-func (m *metricDcgmGpuMemoryBytesUsed) updateCapacity() { +func (m *metricGpuDcgmClockFrequency) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. -func (m *metricDcgmGpuMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmClockFrequency) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -142,8 +218,8 @@ func (m *metricDcgmGpuMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { } } -func newMetricDcgmGpuMemoryBytesUsed(cfg MetricConfig) metricDcgmGpuMemoryBytesUsed { - m := metricDcgmGpuMemoryBytesUsed{config: cfg} +func newMetricGpuDcgmClockFrequency(cfg MetricConfig) metricGpuDcgmClockFrequency { + m := metricGpuDcgmClockFrequency{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -151,22 +227,74 @@ func newMetricDcgmGpuMemoryBytesUsed(cfg MetricConfig) metricDcgmGpuMemoryBytesU return m } -type metricDcgmGpuProfilingDramUtilization struct { +type metricGpuDcgmClockThrottleDurationTime struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.dram_utilization metric with initial data. -func (m *metricDcgmGpuProfilingDramUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.dram_utilization") - m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") +// init fills gpu.dcgm.clock.throttle_duration.time metric with initial data. 
+func (m *metricGpuDcgmClockThrottleDurationTime) init() { + m.data.SetName("gpu.dcgm.clock.throttle_duration.time") + m.data.SetDescription("Clock throttle total duration.") + m.data.SetUnit("s") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmClockThrottleDurationTime) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, gpuClockViolationAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) + dp.Attributes().PutStr("gpu.clock.violation", gpuClockViolationAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmClockThrottleDurationTime) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmClockThrottleDurationTime) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmClockThrottleDurationTime(cfg MetricConfig) metricGpuDcgmClockThrottleDurationTime { + m := metricGpuDcgmClockThrottleDurationTime{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmCodecDecoderUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. 
+} + +// init fills gpu.dcgm.codec.decoder.utilization metric with initial data. +func (m *metricGpuDcgmCodecDecoderUtilization) init() { + m.data.SetName("gpu.dcgm.codec.decoder.utilization") + m.data.SetDescription("Decoder utilization.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricGpuDcgmCodecDecoderUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -174,20 +302,17 @@ func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingDramUtilization) updateCapacity() { +func (m *metricGpuDcgmCodecDecoderUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingDramUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmCodecDecoderUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -195,8 +320,8 @@ func (m *metricDcgmGpuProfilingDramUtilization) emit(metrics pmetric.MetricSlice } } -func newMetricDcgmGpuProfilingDramUtilization(cfg MetricConfig) metricDcgmGpuProfilingDramUtilization { - m := metricDcgmGpuProfilingDramUtilization{config: cfg} +func newMetricGpuDcgmCodecDecoderUtilization(cfg MetricConfig) metricGpuDcgmCodecDecoderUtilization { + m := metricGpuDcgmCodecDecoderUtilization{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -204,44 +329,192 @@ func newMetricDcgmGpuProfilingDramUtilization(cfg MetricConfig) metricDcgmGpuPro return m } -type metricDcgmGpuProfilingNvlinkTrafficRate struct { +type metricGpuDcgmCodecEncoderUtilization struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.nvlink_traffic_rate metric with initial data. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) init() { - m.data.SetName("dcgm.gpu.profiling.nvlink_traffic_rate") - m.data.SetDescription("The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers.") - m.data.SetUnit("By/s") +// init fills gpu.dcgm.codec.encoder.utilization metric with initial data. 
+func (m *metricGpuDcgmCodecEncoderUtilization) init() { + m.data.SetName("gpu.dcgm.codec.encoder.utilization") + m.data.SetDescription("Encoder utilization.") + m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue string) { +func (m *metricGpuDcgmCodecEncoderUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } dp := m.data.Gauge().DataPoints().AppendEmpty() dp.SetStartTimestamp(start) dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmCodecEncoderUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmCodecEncoderUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmCodecEncoderUtilization(cfg MetricConfig) metricGpuDcgmCodecEncoderUtilization { + m := metricGpuDcgmCodecEncoderUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmEccErrors struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.ecc_errors metric with initial data. 
+func (m *metricGpuDcgmEccErrors) init() { + m.data.SetName("gpu.dcgm.ecc_errors") + m.data.SetDescription("Data corruption errors.") + m.data.SetUnit("1") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmEccErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, gpuErrorTypeAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) - dp.Attributes().PutStr("direction", directionAttributeValue) + dp.Attributes().PutStr("gpu.error.type", gpuErrorTypeAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmEccErrors) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmEccErrors) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmEccErrors(cfg MetricConfig) metricGpuDcgmEccErrors { + m := metricGpuDcgmEccErrors{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmEnergyConsumption struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. 
+} + +// init fills gpu.dcgm.energy_consumption metric with initial data. +func (m *metricGpuDcgmEnergyConsumption) init() { + m.data.SetName("gpu.dcgm.energy_consumption") + m.data.SetDescription("Total energy consumption for the GPU in J since the driver was last reloaded.") + m.data.SetUnit("J") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) +} + +func (m *metricGpuDcgmEnergyConsumption) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) updateCapacity() { +func (m *metricGpuDcgmEnergyConsumption) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmEnergyConsumption) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmEnergyConsumption(cfg MetricConfig) metricGpuDcgmEnergyConsumption { + m := metricGpuDcgmEnergyConsumption{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmMemoryBandwidthUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.memory.bandwidth_utilization metric with initial data. 
+func (m *metricGpuDcgmMemoryBandwidthUtilization) init() { + m.data.SetName("gpu.dcgm.memory.bandwidth_utilization") + m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricGpuDcgmMemoryBandwidthUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmMemoryBandwidthUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmMemoryBandwidthUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -249,8 +522,8 @@ func (m *metricDcgmGpuProfilingNvlinkTrafficRate) emit(metrics pmetric.MetricSli } } -func newMetricDcgmGpuProfilingNvlinkTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingNvlinkTrafficRate { - m := metricDcgmGpuProfilingNvlinkTrafficRate{config: cfg} +func newMetricGpuDcgmMemoryBandwidthUtilization(cfg MetricConfig) metricGpuDcgmMemoryBandwidthUtilization { + m := metricGpuDcgmMemoryBandwidthUtilization{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -258,22 +531,22 @@ func newMetricDcgmGpuProfilingNvlinkTrafficRate(cfg MetricConfig) metricDcgmGpuP return m } -type metricDcgmGpuProfilingPcieTrafficRate struct { +type metricGpuDcgmMemoryBytesUsed struct { data pmetric.Metric // data buffer for 
generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.pcie_traffic_rate metric with initial data. -func (m *metricDcgmGpuProfilingPcieTrafficRate) init() { - m.data.SetName("dcgm.gpu.profiling.pcie_traffic_rate") - m.data.SetDescription("The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads.") - m.data.SetUnit("By/s") +// init fills gpu.dcgm.memory.bytes_used metric with initial data. +func (m *metricGpuDcgmMemoryBytesUsed) init() { + m.data.SetName("gpu.dcgm.memory.bytes_used") + m.data.SetDescription("Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.") + m.data.SetUnit("By") m.data.SetEmptyGauge() m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue string) { +func (m *metricGpuDcgmMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, gpuMemoryStateAttributeValue string) { if !m.config.Enabled { return } @@ -281,21 +554,18 @@ func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) - dp.Attributes().PutStr("direction", directionAttributeValue) + dp.Attributes().PutStr("gpu.memory.state", gpuMemoryStateAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
-func (m *metricDcgmGpuProfilingPcieTrafficRate) updateCapacity() { +func (m *metricGpuDcgmMemoryBytesUsed) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. -func (m *metricDcgmGpuProfilingPcieTrafficRate) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -303,8 +573,114 @@ func (m *metricDcgmGpuProfilingPcieTrafficRate) emit(metrics pmetric.MetricSlice } } -func newMetricDcgmGpuProfilingPcieTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingPcieTrafficRate { - m := metricDcgmGpuProfilingPcieTrafficRate{config: cfg} +func newMetricGpuDcgmMemoryBytesUsed(cfg MetricConfig) metricGpuDcgmMemoryBytesUsed { + m := metricGpuDcgmMemoryBytesUsed{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmNvlinkIo struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.nvlink.io metric with initial data. 
+func (m *metricGpuDcgmNvlinkIo) init() { + m.data.SetName("gpu.dcgm.nvlink.io") + m.data.SetDescription("The number of bytes sent over NVLink, not including protocol headers.") + m.data.SetUnit("By") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmNvlinkIo) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("network.io.direction", networkIoDirectionAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmNvlinkIo) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmNvlinkIo) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmNvlinkIo(cfg MetricConfig) metricGpuDcgmNvlinkIo { + m := metricGpuDcgmNvlinkIo{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmPcieIo struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.pcie.io metric with initial data. 
+func (m *metricGpuDcgmPcieIo) init() { + m.data.SetName("gpu.dcgm.pcie.io") + m.data.SetDescription("The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.") + m.data.SetUnit("By") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmPcieIo) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("network.io.direction", networkIoDirectionAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmPcieIo) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmPcieIo) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmPcieIo(cfg MetricConfig) metricGpuDcgmPcieIo { + m := metricGpuDcgmPcieIo{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -312,22 +688,22 @@ func newMetricDcgmGpuProfilingPcieTrafficRate(cfg MetricConfig) metricDcgmGpuPro return m } -type metricDcgmGpuProfilingPipeUtilization struct { +type metricGpuDcgmPipeUtilization struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. 
} -// init fills dcgm.gpu.profiling.pipe_utilization metric with initial data. -func (m *metricDcgmGpuProfilingPipeUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.pipe_utilization") +// init fills gpu.dcgm.pipe.utilization metric with initial data. +func (m *metricGpuDcgmPipeUtilization) init() { + m.data.SetName("gpu.dcgm.pipe.utilization") m.data.SetDescription("Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, pipeAttributeValue string) { +func (m *metricGpuDcgmPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, gpuPipeAttributeValue string) { if !m.config.Enabled { return } @@ -335,21 +711,18 @@ func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) - dp.Attributes().PutStr("pipe", pipeAttributeValue) + dp.Attributes().PutStr("gpu.pipe", gpuPipeAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingPipeUtilization) updateCapacity() { +func (m *metricGpuDcgmPipeUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingPipeUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmPipeUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -357,8 +730,8 @@ func (m *metricDcgmGpuProfilingPipeUtilization) emit(metrics pmetric.MetricSlice } } -func newMetricDcgmGpuProfilingPipeUtilization(cfg MetricConfig) metricDcgmGpuProfilingPipeUtilization { - m := metricDcgmGpuProfilingPipeUtilization{config: cfg} +func newMetricGpuDcgmPipeUtilization(cfg MetricConfig) metricGpuDcgmPipeUtilization { + m := metricGpuDcgmPipeUtilization{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -366,22 +739,21 @@ func newMetricDcgmGpuProfilingPipeUtilization(cfg MetricConfig) metricDcgmGpuPro return m } -type metricDcgmGpuProfilingSmOccupancy struct { +type metricGpuDcgmSmOccupancy struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.sm_occupancy metric with initial data. -func (m *metricDcgmGpuProfilingSmOccupancy) init() { - m.data.SetName("dcgm.gpu.profiling.sm_occupancy") - m.data.SetDescription("Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.") +// init fills gpu.dcgm.sm.occupancy metric with initial data. 
+func (m *metricGpuDcgmSmOccupancy) init() { + m.data.SetName("gpu.dcgm.sm.occupancy") + m.data.SetDescription("Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricGpuDcgmSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -389,20 +761,17 @@ func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timest dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingSmOccupancy) updateCapacity() { +func (m *metricGpuDcgmSmOccupancy) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingSmOccupancy) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmSmOccupancy) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -410,8 +779,8 @@ func (m *metricDcgmGpuProfilingSmOccupancy) emit(metrics pmetric.MetricSlice) { } } -func newMetricDcgmGpuProfilingSmOccupancy(cfg MetricConfig) metricDcgmGpuProfilingSmOccupancy { - m := metricDcgmGpuProfilingSmOccupancy{config: cfg} +func newMetricGpuDcgmSmOccupancy(cfg MetricConfig) metricGpuDcgmSmOccupancy { + m := metricGpuDcgmSmOccupancy{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -419,22 +788,70 @@ func newMetricDcgmGpuProfilingSmOccupancy(cfg MetricConfig) metricDcgmGpuProfili return m } -type metricDcgmGpuProfilingSmUtilization struct { +type metricGpuDcgmSmUtilization struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.sm_utilization metric with initial data. -func (m *metricDcgmGpuProfilingSmUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.sm_utilization") +// init fills gpu.dcgm.sm.utilization metric with initial data. 
+func (m *metricGpuDcgmSmUtilization) init() { + m.data.SetName("gpu.dcgm.sm.utilization") m.data.SetDescription("Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricGpuDcgmSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmSmUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmSmUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmSmUtilization(cfg MetricConfig) metricGpuDcgmSmUtilization { + m := metricGpuDcgmSmUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmTemperature struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.temperature metric with initial data. 
+func (m *metricGpuDcgmTemperature) init() { + m.data.SetName("gpu.dcgm.temperature") + m.data.SetDescription("Current temperature readings for the device, in ˚C.") + m.data.SetUnit("Cel") + m.data.SetEmptyGauge() +} + +func (m *metricGpuDcgmTemperature) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -442,20 +859,17 @@ func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Time dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingSmUtilization) updateCapacity() { +func (m *metricGpuDcgmTemperature) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingSmUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmTemperature) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -463,8 +877,8 @@ func (m *metricDcgmGpuProfilingSmUtilization) emit(metrics pmetric.MetricSlice) } } -func newMetricDcgmGpuProfilingSmUtilization(cfg MetricConfig) metricDcgmGpuProfilingSmUtilization { - m := metricDcgmGpuProfilingSmUtilization{config: cfg} +func newMetricGpuDcgmTemperature(cfg MetricConfig) metricGpuDcgmTemperature { + m := metricGpuDcgmTemperature{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -472,22 +886,21 @@ func newMetricDcgmGpuProfilingSmUtilization(cfg MetricConfig) metricDcgmGpuProfi return m } -type metricDcgmGpuUtilization struct { +type metricGpuDcgmUtilization struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.utilization metric with initial data. -func (m *metricDcgmGpuUtilization) init() { - m.data.SetName("dcgm.gpu.utilization") - m.data.SetDescription("Fraction of time the GPU was not idle.") +// init fills gpu.dcgm.utilization metric with initial data. 
+func (m *metricGpuDcgmUtilization) init() { + m.data.SetName("gpu.dcgm.utilization") + m.data.SetDescription("Ratio of time the graphics engine is active.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricGpuDcgmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -495,20 +908,17 @@ func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts p dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuUtilization) updateCapacity() { +func (m *metricGpuDcgmUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -516,8 +926,61 @@ func (m *metricDcgmGpuUtilization) emit(metrics pmetric.MetricSlice) { } } -func newMetricDcgmGpuUtilization(cfg MetricConfig) metricDcgmGpuUtilization { - m := metricDcgmGpuUtilization{config: cfg} +func newMetricGpuDcgmUtilization(cfg MetricConfig) metricGpuDcgmUtilization { + m := metricGpuDcgmUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmXidErrors struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.xid_errors metric with initial data. +func (m *metricGpuDcgmXidErrors) init() { + m.data.SetName("gpu.dcgm.xid_errors") + m.data.SetDescription("XID errors.") + m.data.SetUnit("1") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmXidErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, gpuErrorXidAttributeValue int64) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutInt("gpu.error.xid", gpuErrorXidAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. 
+func (m *metricGpuDcgmXidErrors) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmXidErrors) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmXidErrors(cfg MetricConfig) metricGpuDcgmXidErrors { + m := metricGpuDcgmXidErrors{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -533,14 +996,24 @@ type MetricsBuilder struct { metricsCapacity int // maximum observed number of metrics per resource. metricsBuffer pmetric.Metrics // accumulates metrics data before emitting. buildInfo component.BuildInfo // contains version information. - metricDcgmGpuMemoryBytesUsed metricDcgmGpuMemoryBytesUsed - metricDcgmGpuProfilingDramUtilization metricDcgmGpuProfilingDramUtilization - metricDcgmGpuProfilingNvlinkTrafficRate metricDcgmGpuProfilingNvlinkTrafficRate - metricDcgmGpuProfilingPcieTrafficRate metricDcgmGpuProfilingPcieTrafficRate - metricDcgmGpuProfilingPipeUtilization metricDcgmGpuProfilingPipeUtilization - metricDcgmGpuProfilingSmOccupancy metricDcgmGpuProfilingSmOccupancy - metricDcgmGpuProfilingSmUtilization metricDcgmGpuProfilingSmUtilization - metricDcgmGpuUtilization metricDcgmGpuUtilization + resourceAttributeIncludeFilter map[string]filter.Filter + resourceAttributeExcludeFilter map[string]filter.Filter + metricGpuDcgmClockFrequency metricGpuDcgmClockFrequency + metricGpuDcgmClockThrottleDurationTime metricGpuDcgmClockThrottleDurationTime + metricGpuDcgmCodecDecoderUtilization metricGpuDcgmCodecDecoderUtilization + metricGpuDcgmCodecEncoderUtilization metricGpuDcgmCodecEncoderUtilization + metricGpuDcgmEccErrors metricGpuDcgmEccErrors + metricGpuDcgmEnergyConsumption 
metricGpuDcgmEnergyConsumption + metricGpuDcgmMemoryBandwidthUtilization metricGpuDcgmMemoryBandwidthUtilization + metricGpuDcgmMemoryBytesUsed metricGpuDcgmMemoryBytesUsed + metricGpuDcgmNvlinkIo metricGpuDcgmNvlinkIo + metricGpuDcgmPcieIo metricGpuDcgmPcieIo + metricGpuDcgmPipeUtilization metricGpuDcgmPipeUtilization + metricGpuDcgmSmOccupancy metricGpuDcgmSmOccupancy + metricGpuDcgmSmUtilization metricGpuDcgmSmUtilization + metricGpuDcgmTemperature metricGpuDcgmTemperature + metricGpuDcgmUtilization metricGpuDcgmUtilization + metricGpuDcgmXidErrors metricGpuDcgmXidErrors } // metricBuilderOption applies changes to default metrics builder. @@ -559,14 +1032,42 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting startTime: pcommon.NewTimestampFromTime(time.Now()), metricsBuffer: pmetric.NewMetrics(), buildInfo: settings.BuildInfo, - metricDcgmGpuMemoryBytesUsed: newMetricDcgmGpuMemoryBytesUsed(mbc.Metrics.DcgmGpuMemoryBytesUsed), - metricDcgmGpuProfilingDramUtilization: newMetricDcgmGpuProfilingDramUtilization(mbc.Metrics.DcgmGpuProfilingDramUtilization), - metricDcgmGpuProfilingNvlinkTrafficRate: newMetricDcgmGpuProfilingNvlinkTrafficRate(mbc.Metrics.DcgmGpuProfilingNvlinkTrafficRate), - metricDcgmGpuProfilingPcieTrafficRate: newMetricDcgmGpuProfilingPcieTrafficRate(mbc.Metrics.DcgmGpuProfilingPcieTrafficRate), - metricDcgmGpuProfilingPipeUtilization: newMetricDcgmGpuProfilingPipeUtilization(mbc.Metrics.DcgmGpuProfilingPipeUtilization), - metricDcgmGpuProfilingSmOccupancy: newMetricDcgmGpuProfilingSmOccupancy(mbc.Metrics.DcgmGpuProfilingSmOccupancy), - metricDcgmGpuProfilingSmUtilization: newMetricDcgmGpuProfilingSmUtilization(mbc.Metrics.DcgmGpuProfilingSmUtilization), - metricDcgmGpuUtilization: newMetricDcgmGpuUtilization(mbc.Metrics.DcgmGpuUtilization), + metricGpuDcgmClockFrequency: newMetricGpuDcgmClockFrequency(mbc.Metrics.GpuDcgmClockFrequency), + metricGpuDcgmClockThrottleDurationTime: 
newMetricGpuDcgmClockThrottleDurationTime(mbc.Metrics.GpuDcgmClockThrottleDurationTime), + metricGpuDcgmCodecDecoderUtilization: newMetricGpuDcgmCodecDecoderUtilization(mbc.Metrics.GpuDcgmCodecDecoderUtilization), + metricGpuDcgmCodecEncoderUtilization: newMetricGpuDcgmCodecEncoderUtilization(mbc.Metrics.GpuDcgmCodecEncoderUtilization), + metricGpuDcgmEccErrors: newMetricGpuDcgmEccErrors(mbc.Metrics.GpuDcgmEccErrors), + metricGpuDcgmEnergyConsumption: newMetricGpuDcgmEnergyConsumption(mbc.Metrics.GpuDcgmEnergyConsumption), + metricGpuDcgmMemoryBandwidthUtilization: newMetricGpuDcgmMemoryBandwidthUtilization(mbc.Metrics.GpuDcgmMemoryBandwidthUtilization), + metricGpuDcgmMemoryBytesUsed: newMetricGpuDcgmMemoryBytesUsed(mbc.Metrics.GpuDcgmMemoryBytesUsed), + metricGpuDcgmNvlinkIo: newMetricGpuDcgmNvlinkIo(mbc.Metrics.GpuDcgmNvlinkIo), + metricGpuDcgmPcieIo: newMetricGpuDcgmPcieIo(mbc.Metrics.GpuDcgmPcieIo), + metricGpuDcgmPipeUtilization: newMetricGpuDcgmPipeUtilization(mbc.Metrics.GpuDcgmPipeUtilization), + metricGpuDcgmSmOccupancy: newMetricGpuDcgmSmOccupancy(mbc.Metrics.GpuDcgmSmOccupancy), + metricGpuDcgmSmUtilization: newMetricGpuDcgmSmUtilization(mbc.Metrics.GpuDcgmSmUtilization), + metricGpuDcgmTemperature: newMetricGpuDcgmTemperature(mbc.Metrics.GpuDcgmTemperature), + metricGpuDcgmUtilization: newMetricGpuDcgmUtilization(mbc.Metrics.GpuDcgmUtilization), + metricGpuDcgmXidErrors: newMetricGpuDcgmXidErrors(mbc.Metrics.GpuDcgmXidErrors), + resourceAttributeIncludeFilter: make(map[string]filter.Filter), + resourceAttributeExcludeFilter: make(map[string]filter.Filter), + } + if mbc.ResourceAttributes.GpuModel.MetricsInclude != nil { + mb.resourceAttributeIncludeFilter["gpu.model"] = filter.CreateFilter(mbc.ResourceAttributes.GpuModel.MetricsInclude) + } + if mbc.ResourceAttributes.GpuModel.MetricsExclude != nil { + mb.resourceAttributeExcludeFilter["gpu.model"] = filter.CreateFilter(mbc.ResourceAttributes.GpuModel.MetricsExclude) + } + if 
mbc.ResourceAttributes.GpuNumber.MetricsInclude != nil { + mb.resourceAttributeIncludeFilter["gpu.number"] = filter.CreateFilter(mbc.ResourceAttributes.GpuNumber.MetricsInclude) + } + if mbc.ResourceAttributes.GpuNumber.MetricsExclude != nil { + mb.resourceAttributeExcludeFilter["gpu.number"] = filter.CreateFilter(mbc.ResourceAttributes.GpuNumber.MetricsExclude) + } + if mbc.ResourceAttributes.GpuUUID.MetricsInclude != nil { + mb.resourceAttributeIncludeFilter["gpu.uuid"] = filter.CreateFilter(mbc.ResourceAttributes.GpuUUID.MetricsInclude) + } + if mbc.ResourceAttributes.GpuUUID.MetricsExclude != nil { + mb.resourceAttributeExcludeFilter["gpu.uuid"] = filter.CreateFilter(mbc.ResourceAttributes.GpuUUID.MetricsExclude) } for _, op := range options { @@ -575,6 +1076,11 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting return mb } +// NewResourceBuilder returns a new resource builder that should be used to build a resource associated with for the emitted metrics. +func (mb *MetricsBuilder) NewResourceBuilder() *ResourceBuilder { + return NewResourceBuilder(mb.config.ResourceAttributes) +} + // updateCapacity updates max length of metrics and resource attributes that will be used for the slice capacity. 
func (mb *MetricsBuilder) updateCapacity(rm pmetric.ResourceMetrics) { if mb.metricsCapacity < rm.ScopeMetrics().At(0).Metrics().Len() { @@ -624,18 +1130,36 @@ func (mb *MetricsBuilder) EmitForResource(rmo ...ResourceMetricsOption) { ils.Scope().SetName("github.com/GoogleCloudPlatform/opentelemetry-operations-collector/receiver/dcgmreceiver") ils.Scope().SetVersion(mb.buildInfo.Version) ils.Metrics().EnsureCapacity(mb.metricsCapacity) - mb.metricDcgmGpuMemoryBytesUsed.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingDramUtilization.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingNvlinkTrafficRate.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingPcieTrafficRate.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingPipeUtilization.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingSmOccupancy.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingSmUtilization.emit(ils.Metrics()) - mb.metricDcgmGpuUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmClockFrequency.emit(ils.Metrics()) + mb.metricGpuDcgmClockThrottleDurationTime.emit(ils.Metrics()) + mb.metricGpuDcgmCodecDecoderUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmCodecEncoderUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmEccErrors.emit(ils.Metrics()) + mb.metricGpuDcgmEnergyConsumption.emit(ils.Metrics()) + mb.metricGpuDcgmMemoryBandwidthUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmMemoryBytesUsed.emit(ils.Metrics()) + mb.metricGpuDcgmNvlinkIo.emit(ils.Metrics()) + mb.metricGpuDcgmPcieIo.emit(ils.Metrics()) + mb.metricGpuDcgmPipeUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmSmOccupancy.emit(ils.Metrics()) + mb.metricGpuDcgmSmUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmTemperature.emit(ils.Metrics()) + mb.metricGpuDcgmUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmXidErrors.emit(ils.Metrics()) for _, op := range rmo { op(rm) } + for attr, filter := range mb.resourceAttributeIncludeFilter { + if val, ok := rm.Resource().Attributes().Get(attr); ok && !filter.Matches(val.AsString()) { + return + } + } + for attr, 
filter := range mb.resourceAttributeExcludeFilter { + if val, ok := rm.Resource().Attributes().Get(attr); ok && filter.Matches(val.AsString()) { + return + } + } if ils.Metrics().Len() > 0 { mb.updateCapacity(rm) @@ -653,44 +1177,84 @@ func (mb *MetricsBuilder) Emit(rmo ...ResourceMetricsOption) pmetric.Metrics { return metrics } -// RecordDcgmGpuMemoryBytesUsedDataPoint adds a data point to dcgm.gpu.memory.bytes_used metric. -func (mb *MetricsBuilder) RecordDcgmGpuMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, memoryStateAttributeValue AttributeMemoryState) { - mb.metricDcgmGpuMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, memoryStateAttributeValue.String()) +// RecordGpuDcgmClockFrequencyDataPoint adds a data point to gpu.dcgm.clock.frequency metric. +func (mb *MetricsBuilder) RecordGpuDcgmClockFrequencyDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmClockFrequency.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmClockThrottleDurationTimeDataPoint adds a data point to gpu.dcgm.clock.throttle_duration.time metric. +func (mb *MetricsBuilder) RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts pcommon.Timestamp, val float64, gpuClockViolationAttributeValue AttributeGpuClockViolation) { + mb.metricGpuDcgmClockThrottleDurationTime.recordDataPoint(mb.startTime, ts, val, gpuClockViolationAttributeValue.String()) +} + +// RecordGpuDcgmCodecDecoderUtilizationDataPoint adds a data point to gpu.dcgm.codec.decoder.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmCodecDecoderUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmCodecDecoderUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmCodecEncoderUtilizationDataPoint adds a data point to gpu.dcgm.codec.encoder.utilization metric. 
+func (mb *MetricsBuilder) RecordGpuDcgmCodecEncoderUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmCodecEncoderUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmEccErrorsDataPoint adds a data point to gpu.dcgm.ecc_errors metric. +func (mb *MetricsBuilder) RecordGpuDcgmEccErrorsDataPoint(ts pcommon.Timestamp, val int64, gpuErrorTypeAttributeValue AttributeGpuErrorType) { + mb.metricGpuDcgmEccErrors.recordDataPoint(mb.startTime, ts, val, gpuErrorTypeAttributeValue.String()) +} + +// RecordGpuDcgmEnergyConsumptionDataPoint adds a data point to gpu.dcgm.energy_consumption metric. +func (mb *MetricsBuilder) RecordGpuDcgmEnergyConsumptionDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmEnergyConsumption.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmMemoryBandwidthUtilizationDataPoint adds a data point to gpu.dcgm.memory.bandwidth_utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmMemoryBandwidthUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmMemoryBytesUsedDataPoint adds a data point to gpu.dcgm.memory.bytes_used metric. +func (mb *MetricsBuilder) RecordGpuDcgmMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, gpuMemoryStateAttributeValue AttributeGpuMemoryState) { + mb.metricGpuDcgmMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, gpuMemoryStateAttributeValue.String()) +} + +// RecordGpuDcgmNvlinkIoDataPoint adds a data point to gpu.dcgm.nvlink.io metric. +func (mb *MetricsBuilder) RecordGpuDcgmNvlinkIoDataPoint(ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue AttributeNetworkIoDirection) { + mb.metricGpuDcgmNvlinkIo.recordDataPoint(mb.startTime, ts, val, networkIoDirectionAttributeValue.String()) } -// RecordDcgmGpuProfilingDramUtilizationDataPoint adds a data point to dcgm.gpu.profiling.dram_utilization metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingDramUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuProfilingDramUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +// RecordGpuDcgmPcieIoDataPoint adds a data point to gpu.dcgm.pcie.io metric. +func (mb *MetricsBuilder) RecordGpuDcgmPcieIoDataPoint(ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue AttributeNetworkIoDirection) { + mb.metricGpuDcgmPcieIo.recordDataPoint(mb.startTime, ts, val, networkIoDirectionAttributeValue.String()) } -// RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint adds a data point to dcgm.gpu.profiling.nvlink_traffic_rate metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue AttributeDirection) { - mb.metricDcgmGpuProfilingNvlinkTrafficRate.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, directionAttributeValue.String()) +// RecordGpuDcgmPipeUtilizationDataPoint adds a data point to gpu.dcgm.pipe.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, gpuPipeAttributeValue AttributeGpuPipe) { + mb.metricGpuDcgmPipeUtilization.recordDataPoint(mb.startTime, ts, val, gpuPipeAttributeValue.String()) } -// RecordDcgmGpuProfilingPcieTrafficRateDataPoint adds a data point to dcgm.gpu.profiling.pcie_traffic_rate metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue AttributeDirection) { - mb.metricDcgmGpuProfilingPcieTrafficRate.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, directionAttributeValue.String()) +// RecordGpuDcgmSmOccupancyDataPoint adds a data point to gpu.dcgm.sm.occupancy metric. +func (mb *MetricsBuilder) RecordGpuDcgmSmOccupancyDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmSmOccupancy.recordDataPoint(mb.startTime, ts, val) } -// RecordDcgmGpuProfilingPipeUtilizationDataPoint adds a data point to dcgm.gpu.profiling.pipe_utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, pipeAttributeValue AttributePipe) { - mb.metricDcgmGpuProfilingPipeUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, pipeAttributeValue.String()) +// RecordGpuDcgmSmUtilizationDataPoint adds a data point to gpu.dcgm.sm.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmSmUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmSmUtilization.recordDataPoint(mb.startTime, ts, val) } -// RecordDcgmGpuProfilingSmOccupancyDataPoint adds a data point to dcgm.gpu.profiling.sm_occupancy metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmOccupancyDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuProfilingSmOccupancy.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +// RecordGpuDcgmTemperatureDataPoint adds a data point to gpu.dcgm.temperature metric. +func (mb *MetricsBuilder) RecordGpuDcgmTemperatureDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmTemperature.recordDataPoint(mb.startTime, ts, val) } -// RecordDcgmGpuProfilingSmUtilizationDataPoint adds a data point to dcgm.gpu.profiling.sm_utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuProfilingSmUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +// RecordGpuDcgmUtilizationDataPoint adds a data point to gpu.dcgm.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmUtilization.recordDataPoint(mb.startTime, ts, val) } -// RecordDcgmGpuUtilizationDataPoint adds a data point to dcgm.gpu.utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +// RecordGpuDcgmXidErrorsDataPoint adds a data point to gpu.dcgm.xid_errors metric. 
+func (mb *MetricsBuilder) RecordGpuDcgmXidErrorsDataPoint(ts pcommon.Timestamp, val int64, gpuErrorXidAttributeValue int64) { + mb.metricGpuDcgmXidErrors.recordDataPoint(mb.startTime, ts, val, gpuErrorXidAttributeValue) } // Reset resets metrics builder to its initial state. It should be used when external metrics source is restarted, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index 0b68d49bd..e4ba17cd5 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -42,6 +42,15 @@ func TestMetricsBuilder(t *testing.T) { resAttrsSet: testDataSetNone, expectEmpty: true, }, + { + name: "filter_set_include", + resAttrsSet: testDataSetAll, + }, + { + name: "filter_set_exclude", + resAttrsSet: testDataSetAll, + expectEmpty: true, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -61,37 +70,71 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuMemoryBytesUsedDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributeMemoryStateUsed) + mb.RecordGpuDcgmClockFrequencyDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts, 1, AttributeGpuClockViolationPower) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributeDirectionTx) + mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributeDirectionTx) + mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - 
mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributePipeTensor) + mb.RecordGpuDcgmEccErrorsDataPoint(ts, 1, AttributeGpuErrorTypeSbe) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmEnergyConsumptionDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmMemoryBytesUsedDataPoint(ts, 1, AttributeGpuMemoryStateUsed) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmNvlinkIoDataPoint(ts, 1, AttributeNetworkIoDirectionTransmit) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmPcieIoDataPoint(ts, 1, AttributeNetworkIoDirectionTransmit) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmPipeUtilizationDataPoint(ts, 1, AttributeGpuPipeTensor) + + allMetricsCount++ + mb.RecordGpuDcgmSmOccupancyDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordGpuDcgmSmUtilizationDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordGpuDcgmTemperatureDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordGpuDcgmUtilizationDataPoint(ts, 1) - res := pcommon.NewResource() + allMetricsCount++ + mb.RecordGpuDcgmXidErrorsDataPoint(ts, 1, 13) + + rb := mb.NewResourceBuilder() + rb.SetGpuModel("gpu.model-val") + rb.SetGpuNumber("gpu.number-val") + rb.SetGpuUUID("gpu.uuid-val") + res := rb.Emit() metrics := mb.Emit(WithResource(res)) if test.expectEmpty { @@ -113,102 +156,154 @@ func TestMetricsBuilder(t *testing.T) { validatedMetrics := make(map[string]bool) for i := 0; i < ms.Len(); i++ { switch ms.At(i).Name() { - case 
"dcgm.gpu.memory.bytes_used": - assert.False(t, validatedMetrics["dcgm.gpu.memory.bytes_used"], "Found a duplicate in the metrics slice: dcgm.gpu.memory.bytes_used") - validatedMetrics["dcgm.gpu.memory.bytes_used"] = true + case "gpu.dcgm.clock.frequency": + assert.False(t, validatedMetrics["gpu.dcgm.clock.frequency"], "Found a duplicate in the metrics slice: gpu.dcgm.clock.frequency") + validatedMetrics["gpu.dcgm.clock.frequency"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.", ms.At(i).Description()) - assert.Equal(t, "By", ms.At(i).Unit()) + assert.Equal(t, "Multiprocessor clock frequency.", ms.At(i).Description()) + assert.Equal(t, "Hz", ms.At(i).Unit()) dp := ms.At(i).Gauge().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) - assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("memory_state") + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.clock.throttle_duration.time": + assert.False(t, validatedMetrics["gpu.dcgm.clock.throttle_duration.time"], "Found a duplicate in the metrics slice: gpu.dcgm.clock.throttle_duration.time") + validatedMetrics["gpu.dcgm.clock.throttle_duration.time"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + 
assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Clock throttle total duration.", ms.At(i).Description()) + assert.Equal(t, "s", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + attrVal, ok := dp.Attributes().Get("gpu.clock.violation") assert.True(t, ok) - assert.EqualValues(t, "used", attrVal.Str()) - case "dcgm.gpu.profiling.dram_utilization": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.dram_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.dram_utilization") - validatedMetrics["dcgm.gpu.profiling.dram_utilization"] = true + assert.EqualValues(t, "power", attrVal.Str()) + case "gpu.dcgm.codec.decoder.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.codec.decoder.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.codec.decoder.utilization") + validatedMetrics["gpu.dcgm.codec.decoder.utilization"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of cycles data was being sent or received from GPU memory.", ms.At(i).Description()) + assert.Equal(t, "Decoder utilization.", ms.At(i).Description()) assert.Equal(t, "1", ms.At(i).Unit()) dp := ms.At(i).Gauge().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = 
dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - case "dcgm.gpu.profiling.nvlink_traffic_rate": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.nvlink_traffic_rate"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.nvlink_traffic_rate") - validatedMetrics["dcgm.gpu.profiling.nvlink_traffic_rate"] = true + case "gpu.dcgm.codec.encoder.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.codec.encoder.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.codec.encoder.utilization") + validatedMetrics["gpu.dcgm.codec.encoder.utilization"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers.", ms.At(i).Description()) - assert.Equal(t, "By/s", ms.At(i).Unit()) + assert.Equal(t, "Encoder utilization.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) dp := ms.At(i).Gauge().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.ecc_errors": + assert.False(t, validatedMetrics["gpu.dcgm.ecc_errors"], "Found a duplicate in the metrics slice: gpu.dcgm.ecc_errors") + validatedMetrics["gpu.dcgm.ecc_errors"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Data corruption errors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, 
ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("model") + attrVal, ok := dp.Attributes().Get("gpu.error.type") assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("direction") - assert.True(t, ok) - assert.EqualValues(t, "tx", attrVal.Str()) - case "dcgm.gpu.profiling.pcie_traffic_rate": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.pcie_traffic_rate"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.pcie_traffic_rate") - validatedMetrics["dcgm.gpu.profiling.pcie_traffic_rate"] = true + assert.EqualValues(t, "sbe", attrVal.Str()) + case "gpu.dcgm.energy_consumption": + assert.False(t, validatedMetrics["gpu.dcgm.energy_consumption"], "Found a duplicate in the metrics slice: gpu.dcgm.energy_consumption") + validatedMetrics["gpu.dcgm.energy_consumption"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Total energy consumption for the GPU in J since the driver was last reloaded.", ms.At(i).Description()) + assert.Equal(t, "J", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + 
assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.memory.bandwidth_utilization": + assert.False(t, validatedMetrics["gpu.dcgm.memory.bandwidth_utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.memory.bandwidth_utilization") + validatedMetrics["gpu.dcgm.memory.bandwidth_utilization"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads.", ms.At(i).Description()) - assert.Equal(t, "By/s", ms.At(i).Unit()) + assert.Equal(t, "Fraction of cycles data was being sent or received from GPU memory.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.memory.bytes_used": + assert.False(t, validatedMetrics["gpu.dcgm.memory.bytes_used"], "Found a duplicate in the metrics slice: gpu.dcgm.memory.bytes_used") + validatedMetrics["gpu.dcgm.memory.bytes_used"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Current number of GPU memory bytes used by state. 
Summing the values of all states yields the total GPU memory space.", ms.At(i).Description()) + assert.Equal(t, "By", ms.At(i).Unit()) dp := ms.At(i).Gauge().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("model") + attrVal, ok := dp.Attributes().Get("gpu.memory.state") assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") + assert.EqualValues(t, "used", attrVal.Str()) + case "gpu.dcgm.nvlink.io": + assert.False(t, validatedMetrics["gpu.dcgm.nvlink.io"], "Found a duplicate in the metrics slice: gpu.dcgm.nvlink.io") + validatedMetrics["gpu.dcgm.nvlink.io"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "The number of bytes sent over NVLink, not including protocol headers.", ms.At(i).Description()) + assert.Equal(t, "By", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("network.io.direction") assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("direction") + assert.EqualValues(t, "transmit", attrVal.Str()) + case "gpu.dcgm.pcie.io": + assert.False(t, validatedMetrics["gpu.dcgm.pcie.io"], "Found a duplicate in the metrics slice: gpu.dcgm.pcie.io") + 
validatedMetrics["gpu.dcgm.pcie.io"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.", ms.At(i).Description()) + assert.Equal(t, "By", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("network.io.direction") assert.True(t, ok) - assert.EqualValues(t, "tx", attrVal.Str()) - case "dcgm.gpu.profiling.pipe_utilization": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.pipe_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.pipe_utilization") - validatedMetrics["dcgm.gpu.profiling.pipe_utilization"] = true + assert.EqualValues(t, "transmit", attrVal.Str()) + case "gpu.dcgm.pipe.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.pipe.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.pipe.utilization") + validatedMetrics["gpu.dcgm.pipe.utilization"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) assert.Equal(t, "Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.", ms.At(i).Description()) @@ -218,42 +313,24 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - 
attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("pipe") + attrVal, ok := dp.Attributes().Get("gpu.pipe") assert.True(t, ok) assert.EqualValues(t, "tensor", attrVal.Str()) - case "dcgm.gpu.profiling.sm_occupancy": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.sm_occupancy"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.sm_occupancy") - validatedMetrics["dcgm.gpu.profiling.sm_occupancy"] = true + case "gpu.dcgm.sm.occupancy": + assert.False(t, validatedMetrics["gpu.dcgm.sm.occupancy"], "Found a duplicate in the metrics slice: gpu.dcgm.sm.occupancy") + validatedMetrics["gpu.dcgm.sm.occupancy"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.", ms.At(i).Description()) + assert.Equal(t, "Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors.", ms.At(i).Description()) assert.Equal(t, "1", ms.At(i).Unit()) dp := ms.At(i).Gauge().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - case "dcgm.gpu.profiling.sm_utilization": - assert.False(t, 
validatedMetrics["dcgm.gpu.profiling.sm_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.sm_utilization") - validatedMetrics["dcgm.gpu.profiling.sm_utilization"] = true + case "gpu.dcgm.sm.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.sm.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.sm.utilization") + validatedMetrics["gpu.dcgm.sm.utilization"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) assert.Equal(t, "Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.", ms.At(i).Description()) @@ -263,36 +340,47 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - case "dcgm.gpu.utilization": - assert.False(t, validatedMetrics["dcgm.gpu.utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.utilization") - validatedMetrics["dcgm.gpu.utilization"] = true + case "gpu.dcgm.temperature": + assert.False(t, validatedMetrics["gpu.dcgm.temperature"], "Found a duplicate in the metrics slice: gpu.dcgm.temperature") + validatedMetrics["gpu.dcgm.temperature"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Current temperature readings for the device, in ˚C.", ms.At(i).Description()) + assert.Equal(t, "Cel", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, 
dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.utilization") + validatedMetrics["gpu.dcgm.utilization"] = true assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of time the GPU was not idle.", ms.At(i).Description()) + assert.Equal(t, "Ratio of time the graphics engine is active.", ms.At(i).Description()) assert.Equal(t, "1", ms.At(i).Unit()) dp := ms.At(i).Gauge().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") + case "gpu.dcgm.xid_errors": + assert.False(t, validatedMetrics["gpu.dcgm.xid_errors"], "Found a duplicate in the metrics slice: gpu.dcgm.xid_errors") + validatedMetrics["gpu.dcgm.xid_errors"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "XID errors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, 
dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("gpu.error.xid") assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) + assert.EqualValues(t, 13, attrVal.Int()) } } }) diff --git a/receiver/dcgmreceiver/internal/metadata/generated_resource.go b/receiver/dcgmreceiver/internal/metadata/generated_resource.go new file mode 100644 index 000000000..3b9be9a4d --- /dev/null +++ b/receiver/dcgmreceiver/internal/metadata/generated_resource.go @@ -0,0 +1,50 @@ +// Code generated by mdatagen. DO NOT EDIT. + +package metadata + +import ( + "go.opentelemetry.io/collector/pdata/pcommon" +) + +// ResourceBuilder is a helper struct to build resources predefined in metadata.yaml. +// The ResourceBuilder is not thread-safe and must not to be used in multiple goroutines. +type ResourceBuilder struct { + config ResourceAttributesConfig + res pcommon.Resource +} + +// NewResourceBuilder creates a new ResourceBuilder. This method should be called on the start of the application. +func NewResourceBuilder(rac ResourceAttributesConfig) *ResourceBuilder { + return &ResourceBuilder{ + config: rac, + res: pcommon.NewResource(), + } +} + +// SetGpuModel sets provided value as "gpu.model" attribute. +func (rb *ResourceBuilder) SetGpuModel(val string) { + if rb.config.GpuModel.Enabled { + rb.res.Attributes().PutStr("gpu.model", val) + } +} + +// SetGpuNumber sets provided value as "gpu.number" attribute. +func (rb *ResourceBuilder) SetGpuNumber(val string) { + if rb.config.GpuNumber.Enabled { + rb.res.Attributes().PutStr("gpu.number", val) + } +} + +// SetGpuUUID sets provided value as "gpu.uuid" attribute. +func (rb *ResourceBuilder) SetGpuUUID(val string) { + if rb.config.GpuUUID.Enabled { + rb.res.Attributes().PutStr("gpu.uuid", val) + } +} + +// Emit returns the built resource and resets the internal builder state. 
+func (rb *ResourceBuilder) Emit() pcommon.Resource { + r := rb.res + rb.res = pcommon.NewResource() + return r +} diff --git a/receiver/dcgmreceiver/internal/metadata/generated_resource_test.go b/receiver/dcgmreceiver/internal/metadata/generated_resource_test.go new file mode 100644 index 000000000..eeaa832fc --- /dev/null +++ b/receiver/dcgmreceiver/internal/metadata/generated_resource_test.go @@ -0,0 +1,52 @@ +// Code generated by mdatagen. DO NOT EDIT. + +package metadata + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestResourceBuilder(t *testing.T) { + for _, test := range []string{"default", "all_set", "none_set"} { + t.Run(test, func(t *testing.T) { + cfg := loadResourceAttributesConfig(t, test) + rb := NewResourceBuilder(cfg) + rb.SetGpuModel("gpu.model-val") + rb.SetGpuNumber("gpu.number-val") + rb.SetGpuUUID("gpu.uuid-val") + + res := rb.Emit() + assert.Equal(t, 0, rb.Emit().Attributes().Len()) // Second call should return empty Resource + + switch test { + case "default": + assert.Equal(t, 3, res.Attributes().Len()) + case "all_set": + assert.Equal(t, 3, res.Attributes().Len()) + case "none_set": + assert.Equal(t, 0, res.Attributes().Len()) + return + default: + assert.Failf(t, "unexpected test case: %s", test) + } + + val, ok := res.Attributes().Get("gpu.model") + assert.True(t, ok) + if ok { + assert.EqualValues(t, "gpu.model-val", val.Str()) + } + val, ok = res.Attributes().Get("gpu.number") + assert.True(t, ok) + if ok { + assert.EqualValues(t, "gpu.number-val", val.Str()) + } + val, ok = res.Attributes().Get("gpu.uuid") + assert.True(t, ok) + if ok { + assert.EqualValues(t, "gpu.uuid-val", val.Str()) + } + }) + } +} diff --git a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml index 20fbd34ce..2047c57a8 100644 --- a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml +++ b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml @@ 
-1,37 +1,111 @@ default: all_set: metrics: - dcgm.gpu.memory.bytes_used: + gpu.dcgm.clock.frequency: enabled: true - dcgm.gpu.profiling.dram_utilization: + gpu.dcgm.clock.throttle_duration.time: enabled: true - dcgm.gpu.profiling.nvlink_traffic_rate: + gpu.dcgm.codec.decoder.utilization: enabled: true - dcgm.gpu.profiling.pcie_traffic_rate: + gpu.dcgm.codec.encoder.utilization: enabled: true - dcgm.gpu.profiling.pipe_utilization: + gpu.dcgm.ecc_errors: enabled: true - dcgm.gpu.profiling.sm_occupancy: + gpu.dcgm.energy_consumption: enabled: true - dcgm.gpu.profiling.sm_utilization: + gpu.dcgm.memory.bandwidth_utilization: enabled: true - dcgm.gpu.utilization: + gpu.dcgm.memory.bytes_used: + enabled: true + gpu.dcgm.nvlink.io: + enabled: true + gpu.dcgm.pcie.io: + enabled: true + gpu.dcgm.pipe.utilization: + enabled: true + gpu.dcgm.sm.occupancy: + enabled: true + gpu.dcgm.sm.utilization: + enabled: true + gpu.dcgm.temperature: + enabled: true + gpu.dcgm.utilization: + enabled: true + gpu.dcgm.xid_errors: + enabled: true + resource_attributes: + gpu.model: + enabled: true + gpu.number: + enabled: true + gpu.uuid: enabled: true none_set: metrics: - dcgm.gpu.memory.bytes_used: + gpu.dcgm.clock.frequency: + enabled: false + gpu.dcgm.clock.throttle_duration.time: + enabled: false + gpu.dcgm.codec.decoder.utilization: + enabled: false + gpu.dcgm.codec.encoder.utilization: enabled: false - dcgm.gpu.profiling.dram_utilization: + gpu.dcgm.ecc_errors: enabled: false - dcgm.gpu.profiling.nvlink_traffic_rate: + gpu.dcgm.energy_consumption: enabled: false - dcgm.gpu.profiling.pcie_traffic_rate: + gpu.dcgm.memory.bandwidth_utilization: enabled: false - dcgm.gpu.profiling.pipe_utilization: + gpu.dcgm.memory.bytes_used: enabled: false - dcgm.gpu.profiling.sm_occupancy: + gpu.dcgm.nvlink.io: enabled: false - dcgm.gpu.profiling.sm_utilization: + gpu.dcgm.pcie.io: enabled: false - dcgm.gpu.utilization: + gpu.dcgm.pipe.utilization: enabled: false + gpu.dcgm.sm.occupancy: + enabled: 
false + gpu.dcgm.sm.utilization: + enabled: false + gpu.dcgm.temperature: + enabled: false + gpu.dcgm.utilization: + enabled: false + gpu.dcgm.xid_errors: + enabled: false + resource_attributes: + gpu.model: + enabled: false + gpu.number: + enabled: false + gpu.uuid: + enabled: false +filter_set_include: + resource_attributes: + gpu.model: + enabled: true + metrics_include: + - regexp: ".*" + gpu.number: + enabled: true + metrics_include: + - regexp: ".*" + gpu.uuid: + enabled: true + metrics_include: + - regexp: ".*" +filter_set_exclude: + resource_attributes: + gpu.model: + enabled: true + metrics_exclude: + - strict: "gpu.model-val" + gpu.number: + enabled: true + metrics_exclude: + - strict: "gpu.number-val" + gpu.uuid: + enabled: true + metrics_exclude: + - strict: "gpu.uuid-val" diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 60999eff8..6201aeb00 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -5,95 +5,180 @@ status: beta: [metrics] -attributes: - model: +resource_attributes: + gpu.number: type: string - description: GPU model + description: GPU index starting at 0. + enabled: true - uuid: + gpu.uuid: type: string - description: GPU universally unique identifier + description: GPU universally unique identifier. + enabled: true - gpu_number: + gpu.model: type: string - description: GPU index starting at 0. + description: GPU model name. + enabled: true - memory_state: +attributes: + gpu.memory.state: type: string - description: GPU memory used or free - enum: [used, free] + description: GPU memory state, one of [free, used, reserved]. + enum: [used, free, reserved] - pipe: + gpu.pipe: type: string description: GPU pipe in use, one of [tensor, fp64, fp32, fp16]. enum: [tensor, fp64, fp32, fp16] - direction: + network.io.direction: + type: string + description: Direction of the link traffic, one of [transmit, receive]. 
+ enum: [transmit, receive] + + gpu.clock.violation: + type: string + description: Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. + enum: [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock] + + gpu.error.type: type: string - description: Direction of the link traffic, one of [tx, rx]. - enum: [tx, rx] + description: The type of error, one of [sbe, dbe]. + enum: [sbe, dbe] + + gpu.error.xid: + type: int + description: The XID code for the error, 1..143. metrics: - dcgm.gpu.utilization: - enabled: true - description: Fraction of time the GPU was not idle. + gpu.dcgm.utilization: + description: Ratio of time the graphics engine is active. unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] - - dcgm.gpu.memory.bytes_used: enabled: true - description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. - unit: By - gauge: - value_type: int - attributes: [model, gpu_number, uuid, memory_state] - dcgm.gpu.profiling.sm_utilization: - enabled: true + gpu.dcgm.sm.utilization: description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] - - dcgm.gpu.profiling.sm_occupancy: enabled: true - description: Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors. + + gpu.dcgm.sm.occupancy: + description: Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] + enabled: false - dcgm.gpu.profiling.pipe_utilization: - enabled: true + gpu.dcgm.pipe.utilization: description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. 
unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid, pipe] + attributes: [gpu.pipe] + enabled: true - dcgm.gpu.profiling.dram_utilization: + gpu.dcgm.codec.encoder.utilization: + description: Encoder utilization. + unit: 1 + gauge: + value_type: double enabled: true - description: Fraction of cycles data was being sent or received from GPU memory. + + gpu.dcgm.codec.decoder.utilization: + description: Decoder utilization. unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] + enabled: true - dcgm.gpu.profiling.pcie_traffic_rate: + gpu.dcgm.memory.bytes_used: + description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. + unit: By + gauge: + value_type: int + attributes: [gpu.memory.state] enabled: true - description: The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads. - unit: By/s + + gpu.dcgm.memory.bandwidth_utilization: + description: Fraction of cycles data was being sent or received from GPU memory. + unit: 1 gauge: + value_type: double + enabled: true + + gpu.dcgm.pcie.io: + description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. + unit: By + sum: + value_type: int + aggregation_temporality: cumulative + monotonic: true + attributes: [network.io.direction] + enabled: true + + gpu.dcgm.nvlink.io: + description: The number of bytes sent over NVLink, not including protocol headers. + unit: By + sum: value_type: int - attributes: [model, gpu_number, uuid, direction] + aggregation_temporality: cumulative + monotonic: true + attributes: [network.io.direction] + enabled: true - dcgm.gpu.profiling.nvlink_traffic_rate: + gpu.dcgm.energy_consumption: + description: Total energy consumption for the GPU in J since the driver was last reloaded. 
+ unit: J + sum: + value_type: double + aggregation_temporality: cumulative + monotonic: true enabled: true - description: The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers. - unit: By/s + + gpu.dcgm.temperature: + description: Current temperature readings for the device, in ˚C. + unit: Cel gauge: + value_type: double + enabled: true + + gpu.dcgm.clock.frequency: + description: Multiprocessor clock frequency. + unit: Hz + gauge: + value_type: double + enabled: true + + gpu.dcgm.clock.throttle_duration.time: + description: Clock throttle total duration. + unit: s + sum: + value_type: double + aggregation_temporality: cumulative + monotonic: true + attributes: [gpu.clock.violation] + enabled: true + + gpu.dcgm.ecc_errors: + description: Data corruption errors. + unit: 1 + sum: + value_type: int + aggregation_temporality: cumulative + monotonic: true + attributes: [gpu.error.type] + enabled: true + + gpu.dcgm.xid_errors: + description: XID errors. 
+ unit: 1 + sum: value_type: int - attributes: [model, gpu_number, uuid, direction] + aggregation_temporality: cumulative + monotonic: true + attributes: [gpu.error.xid] + enabled: false diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 2768e50d9..f84b17318 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -27,108 +27,346 @@ import ( "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/receiver" + "golang.org/x/sync/errgroup" "github.com/GoogleCloudPlatform/opentelemetry-operations-collector/receiver/dcgmreceiver/internal/metadata" ) type dcgmScraper struct { - config *Config - settings receiver.CreateSettings - client *dcgmClient - mb *metadata.MetricsBuilder + config *Config + settings receiver.CreateSettings + initRetryDelay time.Duration + mb *metadata.MetricsBuilder + collectTriggerCh chan<- struct{} + metricsCh <-chan map[uint]deviceMetrics + cancel func() } func newDcgmScraper(config *Config, settings receiver.CreateSettings) *dcgmScraper { - return &dcgmScraper{config: config, settings: settings} + return &dcgmScraper{config: config, settings: settings, initRetryDelay: 10 * time.Second} } -// initClient will try to create a new dcgmClient if currently has no client; -// it will try to initialize the communication with the DCGM service; if +const scrapePollingInterval = 100 * time.Millisecond // TODO: Choose an appropriate value + +// initClient will try to initialize the communication with the DCGM service; if // success, create a client; only return errors if DCGM service is available but // failed to create client. 
-func (s *dcgmScraper) initClient() error { - if s.client != nil { - return nil +func (s *dcgmScraper) initClient() (*dcgmClient, error) { + clientSettings := &dcgmClientSettings{ + endpoint: s.config.TCPAddrConfig.Endpoint, + pollingInterval: scrapePollingInterval, + fields: discoverRequestedFields(s.config), + retryBlankValues: true, + maxRetries: 5, } - client, err := newClient(s.config, s.settings.Logger) + client, err := newClient(clientSettings, s.settings.Logger) if err != nil { s.settings.Logger.Sugar().Warn(err) if errors.Is(err, ErrDcgmInitialization) { // If cannot connect to DCGM, return no error and retry at next // collection time - return nil + return nil, nil } - return err + return nil, err } - s.client = client - return nil + return client, nil } -func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { +func (s *dcgmScraper) start(ctx context.Context, _ component.Host) error { startTime := pcommon.NewTimestampFromTime(time.Now()) mbConfig := metadata.DefaultMetricsBuilderConfig() mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) + scrapeCtx, scrapeCancel := context.WithCancel(context.WithoutCancel(ctx)) + g, scrapeCtx := errgroup.WithContext(scrapeCtx) + + s.cancel = func() { + scrapeCancel() + _ = g.Wait() // Ignore the error from a canceled context + } + + metricsCh := make(chan map[uint]deviceMetrics) + collectTriggerCh := make(chan struct{}, 1) // Capacity of 1 makes this asynchronous + s.metricsCh = metricsCh + s.collectTriggerCh = collectTriggerCh + + g.Go(func() error { + return s.runConnectLoop(scrapeCtx, metricsCh, collectTriggerCh) + }) + return nil } func (s *dcgmScraper) stop(_ context.Context) error { - if s.client != nil { - s.client.cleanup() + if s.cancel != nil { + s.cancel() + s.cancel = nil } return nil } -func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { - err := s.initClient() - if err != nil || s.client == 
nil { - return s.mb.Emit(), err +func discoverRequestedFields(config *Config) []string { + requestedFields := []string{} + if config.Metrics.GpuDcgmUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_GR_ENGINE_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_DEV_GPU_UTIL") // fallback + } + if config.Metrics.GpuDcgmSmUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_SM_ACTIVE") + } + if config.Metrics.GpuDcgmSmOccupancy.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_SM_OCCUPANCY") + } + if config.Metrics.GpuDcgmPipeUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_FP64_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_FP32_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_FP16_ACTIVE") + } + if config.Metrics.GpuDcgmCodecEncoderUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_ENC_UTIL") + } + if config.Metrics.GpuDcgmCodecDecoderUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_DEC_UTIL") + } + if config.Metrics.GpuDcgmMemoryBytesUsed.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_FB_FREE") + requestedFields = append(requestedFields, "DCGM_FI_DEV_FB_USED") + requestedFields = append(requestedFields, "DCGM_FI_DEV_FB_RESERVED") + } + if config.Metrics.GpuDcgmMemoryBandwidthUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_DRAM_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_DEV_MEM_COPY_UTIL") // fallback + } + if config.Metrics.GpuDcgmPcieIo.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_PCIE_TX_BYTES") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PCIE_RX_BYTES") + } + if config.Metrics.GpuDcgmNvlinkIo.Enabled { + requestedFields = append(requestedFields, 
"DCGM_FI_PROF_NVLINK_TX_BYTES") + requestedFields = append(requestedFields, "DCGM_FI_PROF_NVLINK_RX_BYTES") + } + if config.Metrics.GpuDcgmEnergyConsumption.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_POWER_USAGE") // fallback + } + if config.Metrics.GpuDcgmTemperature.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_GPU_TEMP") + } + if config.Metrics.GpuDcgmClockFrequency.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_SM_CLOCK") + } + if config.Metrics.GpuDcgmClockThrottleDurationTime.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_POWER_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_THERMAL_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_SYNC_BOOST_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_LOW_UTIL_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_RELIABILITY_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION") + } + if config.Metrics.GpuDcgmEccErrors.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL") + requestedFields = append(requestedFields, "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL") + } + if config.Metrics.GpuDcgmXidErrors.Enabled { + // requestedFields = append(requestedFields, "") + func() {}() // no-op + } + + return requestedFields +} + +func (s *dcgmScraper) runConnectLoop(ctx context.Context, metricsCh chan<- map[uint]deviceMetrics, collectTriggerCh <-chan struct{}) error { + defer close(metricsCh) + for { + client, _ := s.initClient() + // Ignore the error; it's logged in initClient. 
+ if client != nil { + s.pollClient(ctx, client, metricsCh, collectTriggerCh) + } + select { + case <-ctx.Done(): + return ctx.Err() + case metricsCh <- map[uint]deviceMetrics{}: + // Un-hang any scrapers waiting for data, since we currently have no metrics to offer. + case <-time.After(s.initRetryDelay): + } + } +} + +func (s *dcgmScraper) pollClient(ctx context.Context, client *dcgmClient, metricsCh chan<- map[uint]deviceMetrics, collectTriggerCh <-chan struct{}) { + defer client.cleanup() + for { + waitTime, err := client.collect() + // Ignore the error; it's logged in collect() + if err != nil { + waitTime = 10 * time.Second + } + // Try to poll at least twice per collection interval + waitTime = max( + 100*time.Millisecond, + min( + s.config.CollectionInterval, + waitTime, + )/2, + ) + s.settings.Logger.Sugar().Debugf("Waiting %s for the next collection", waitTime) + after := time.After(waitTime) + for after != nil { + deviceMetrics := client.getDeviceMetrics() + select { + case <-ctx.Done(): + return + case <-collectTriggerCh: + // Loop and trigger a collect() again. + after = nil + case metricsCh <- deviceMetrics: + case <-after: + after = nil + } + } } +} - deviceMetrics, err := s.client.collectDeviceMetrics() +func (s *dcgmScraper) scrape(ctx context.Context) (pmetric.Metrics, error) { + var deviceMetrics map[uint]deviceMetrics + // Trigger a collection cycle to make sure we have fresh metrics. + // The select ensures that if there's already a request registered we don't block. + select { + case s.collectTriggerCh <- struct{}{}: + default: + } + // Now wait for metrics. 
+ select { + case deviceMetrics = <-s.metricsCh: + case <-ctx.Done(): + return pmetric.NewMetrics(), ctx.Err() + } + s.settings.Logger.Sugar().Debugf("Metrics collected: %d", len(deviceMetrics)) now := pcommon.NewTimestampFromTime(time.Now()) - for _, metric := range deviceMetrics { - model := s.client.getDeviceModelName(metric.gpuIndex) - UUID := s.client.getDeviceUUID(metric.gpuIndex) - gpuIndex := fmt.Sprintf("%d", metric.gpuIndex) - switch metric.name { - case "dcgm.gpu.utilization": - gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordDcgmGpuUtilizationDataPoint(now, gpuUtil, model, gpuIndex, UUID) - case "dcgm.gpu.memory.bytes_used": - bytesUsed := 1e6 * metric.asInt64() /* MB to B */ - s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesUsed, model, gpuIndex, UUID, metadata.AttributeMemoryStateUsed) - case "dcgm.gpu.memory.bytes_free": - bytesFree := 1e6 * metric.asInt64() /* MB to B */ - s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesFree, model, gpuIndex, UUID, metadata.AttributeMemoryStateFree) - case "dcgm.gpu.profiling.sm_utilization": - s.mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID) - case "dcgm.gpu.profiling.sm_occupancy": - s.mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID) - case "dcgm.gpu.profiling.tensor_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeTensor) - case "dcgm.gpu.profiling.fp64_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeFp64) - case "dcgm.gpu.profiling.fp32_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeFp32) - case "dcgm.gpu.profiling.fp16_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, 
gpuIndex, UUID, metadata.AttributePipeFp16) - case "dcgm.gpu.profiling.dram_utilization": - s.mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID) - case "dcgm.gpu.profiling.pcie_sent_bytes": - /* DCGM already returns these as bytes/sec despite the name */ - s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionTx) - case "dcgm.gpu.profiling.pcie_received_bytes": - s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionRx) - case "dcgm.gpu.profiling.nvlink_sent_bytes": - s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionTx) - case "dcgm.gpu.profiling.nvlink_received_bytes": - s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionRx) - } - } - - return s.mb.Emit(), err + for gpuIndex, gpu := range deviceMetrics { + s.settings.Logger.Sugar().Debugf("Got %d unique metrics: %v", len(gpu.Metrics), gpu.Metrics) + rb := s.mb.NewResourceBuilder() + rb.SetGpuNumber(fmt.Sprintf("%d", gpuIndex)) + rb.SetGpuUUID(gpu.UUID) + rb.SetGpuModel(gpu.ModelName) + gpuResource := rb.Emit() + + v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_GR_ENGINE_ACTIVE") + if !ok { + v, ok = gpu.Metrics.LastFloat64("DCGM_FI_DEV_GPU_UTIL") + v /= 100.0 /* normalize */ + } + if ok { + s.mb.RecordGpuDcgmUtilizationDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_SM_ACTIVE"); ok { + s.mb.RecordGpuDcgmSmUtilizationDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_SM_OCCUPANCY"); ok { + s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeTensor) + } + if v, ok := 
gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_FP64_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeFp64) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_FP32_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeFp32) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_FP16_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeFp16) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_ENC_UTIL"); ok { + s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, v/100.0) /* normalize */ + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_DEC_UTIL"); ok { + s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, v/100.0) /* normalize */ + } + if v, ok := gpu.Metrics.LastInt64("DCGM_FI_DEV_FB_FREE"); ok { + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, 1e6*v, metadata.AttributeGpuMemoryStateFree) /* MBy to By */ + } + if v, ok := gpu.Metrics.LastInt64("DCGM_FI_DEV_FB_USED"); ok { + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, 1e6*v, metadata.AttributeGpuMemoryStateUsed) /* MBy to By */ + } + if v, ok := gpu.Metrics.LastInt64("DCGM_FI_DEV_FB_RESERVED"); ok { + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, 1e6*v, metadata.AttributeGpuMemoryStateReserved) /* MBy to By */ + } + v, ok = gpu.Metrics.LastFloat64("DCGM_FI_PROF_DRAM_ACTIVE") + if !ok { // fallback + v, ok = gpu.Metrics.LastFloat64("DCGM_FI_DEV_MEM_COPY_UTIL") + v /= 100.0 /* normalize */ + } + if ok { + s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, v) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_PCIE_TX_BYTES"); ok { + s.mb.RecordGpuDcgmPcieIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionTransmit) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_PCIE_RX_BYTES"); ok { + s.mb.RecordGpuDcgmPcieIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionReceive) + } + if v, ok := 
gpu.Metrics.IntegratedRate("DCGM_FI_PROF_NVLINK_TX_BYTES"); ok { + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionTransmit) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_NVLINK_RX_BYTES"); ok { + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionReceive) + } + i, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION") + v = float64(i) / 1e3 /* mJ to J */ + if !ok { // fallback + i, ok = gpu.Metrics.IntegratedRate("DCGM_FI_DEV_POWER_USAGE") + v = float64(i) + } + if ok { + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_GPU_TEMP"); ok { + s.mb.RecordGpuDcgmTemperatureDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_SM_CLOCK"); ok { + s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, 1e6*v) /* MHz to Hz */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_POWER_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationPower) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_THERMAL_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationThermal) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_SYNC_BOOST_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationSyncBoost) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationBoardLimit) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_LOW_UTIL_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationLowUtil) /* ns to s */ + } + 
if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_RELIABILITY_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationReliability) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationAppClock) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationBaseClock) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"); ok { + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, v, metadata.AttributeGpuErrorTypeSbe) + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"); ok { + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, v, metadata.AttributeGpuErrorTypeDbe) + } + // TODO: XID errors. + // s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) + s.mb.EmitForResource(metadata.WithResource(gpuResource)) + } + + return s.mb.Emit(), nil } diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index e0a3584f2..393bc1912 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -19,6 +19,7 @@ package dcgmreceiver import ( "context" + "errors" "fmt" "testing" "time" @@ -37,6 +38,19 @@ import ( "github.com/GoogleCloudPlatform/opentelemetry-operations-collector/receiver/dcgmreceiver/testprofilepause" ) +func collectScraperResult(t *testing.T, ctx context.Context, scraper *dcgmScraper) (pmetric.Metrics, error) { + for { + metrics, err := scraper.scrape(ctx) + assert.NoError(t, err) + if metrics.MetricCount() > 0 { + // We expect cumulative metrics to be missing on the first scrape. 
+ time.Sleep(scrapePollingInterval) + return scraper.scrape(ctx) + } + time.Sleep(scrapePollingInterval) + } +} + func TestScrapeWithGpuPresent(t *testing.T) { var settings receiver.CreateSettings settings.Logger = zaptest.NewLogger(t) @@ -47,16 +61,60 @@ func TestScrapeWithGpuPresent(t *testing.T) { err := scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) - metrics, err := scraper.scrape(context.Background()) - expectedMetrics := loadExpectedScraperMetrics(t, scraper.client.getDeviceModelName(0)) - validateScraperResult(t, metrics, expectedMetrics) + metrics, err := collectScraperResult(t, context.Background(), scraper) + assert.NoError(t, err) + + assert.NoError(t, scraper.stop(context.Background())) + + validateScraperResult(t, metrics) +} + +func TestScrapeCollectionInterval(t *testing.T) { + var settings receiver.CreateSettings + settings.Logger = zaptest.NewLogger(t) + + var fetchCount int + + realDcgmGetValuesSince := dcgmGetValuesSince + defer func() { dcgmGetValuesSince = realDcgmGetValuesSince }() + dcgmGetValuesSince = func(g dcgm.GroupHandle, f dcgm.FieldHandle, t time.Time) ([]dcgm.FieldValue_v2, time.Time, error) { + fetchCount++ + return realDcgmGetValuesSince(g, f, t) + } + + scraper := newDcgmScraper(createDefaultConfig().(*Config), settings) + require.NotNil(t, scraper) + + err := scraper.start(context.Background(), componenttest.NewNopHost()) + require.NoError(t, err) + + // We expect to scrape every maxKeepSamples * scrapePollingInterval / 2. + // Wait long enough that we expect three scrapes. + const sleepTime = 3.5 * maxKeepSamples * scrapePollingInterval / 2 + + time.Sleep(sleepTime) + + metrics, err := collectScraperResult(t, context.Background(), scraper) + assert.NoError(t, err) + + assert.NoError(t, scraper.stop(context.Background())) + + // We should have seen 1 initial scrape + 3 timed scrapes + 2 scrapes triggered by `collectScraperResult`. 
+ assert.Less(t, fetchCount, 7, "too many fetches") + + validateScraperResult(t, metrics) } func TestScrapeWithDelayedDcgmService(t *testing.T) { realDcgmInit := dcgmInit defer func() { dcgmInit = realDcgmInit }() + failures := 2 dcgmInit = func(args ...string) (func(), error) { - return nil, fmt.Errorf("No DCGM client library *OR* No DCGM connection") + if failures > 0 { + failures-- + return nil, fmt.Errorf("No DCGM client library *OR* No DCGM connection") + } + return realDcgmInit(args...) } var settings receiver.CreateSettings @@ -65,24 +123,21 @@ func TestScrapeWithDelayedDcgmService(t *testing.T) { scraper := newDcgmScraper(createDefaultConfig().(*Config), settings) require.NotNil(t, scraper) + scraper.initRetryDelay = 0 // retry immediately + err := scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) - metrics, err := scraper.scrape(context.Background()) - assert.NoError(t, err) // If failed to init DCGM, should have no error - assert.Equal(t, metrics.MetricCount(), 0) - - // Scrape again with DCGM not available - metrics, err = scraper.scrape(context.Background()) + // Simulate DCGM becomes available after 3 attempts + // scrape should block until DCGM is available + metrics, err := collectScraperResult(t, context.Background(), scraper) assert.NoError(t, err) - assert.Equal(t, metrics.MetricCount(), 0) - // Simulate DCGM becomes available - dcgmInit = realDcgmInit - metrics, err = scraper.scrape(context.Background()) - assert.NoError(t, err) - expectedMetrics := loadExpectedScraperMetrics(t, scraper.client.getDeviceModelName(0)) - validateScraperResult(t, metrics, expectedMetrics) + assert.NoError(t, scraper.stop(context.Background())) + + assert.Equal(t, 0, failures) + + validateScraperResult(t, metrics) } func TestScrapeWithEmptyMetricsConfig(t *testing.T) { @@ -96,28 +151,52 @@ func TestScrapeWithEmptyMetricsConfig(t *testing.T) { Endpoint: defaultEndpoint, }, Metrics: metadata.MetricsConfig{ - DcgmGpuMemoryBytesUsed: 
metadata.MetricConfig{ + GpuDcgmClockFrequency: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmClockThrottleDurationTime: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmCodecDecoderUtilization: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmCodecEncoderUtilization: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmEccErrors: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmEnergyConsumption: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingDramUtilization: metadata.MetricConfig{ + GpuDcgmMemoryBandwidthUtilization: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingNvlinkTrafficRate: metadata.MetricConfig{ + GpuDcgmMemoryBytesUsed: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingPcieTrafficRate: metadata.MetricConfig{ + GpuDcgmNvlinkIo: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingPipeUtilization: metadata.MetricConfig{ + GpuDcgmPcieIo: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingSmOccupancy: metadata.MetricConfig{ + GpuDcgmPipeUtilization: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingSmUtilization: metadata.MetricConfig{ + GpuDcgmSmOccupancy: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuUtilization: metadata.MetricConfig{ + GpuDcgmSmUtilization: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmTemperature: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmUtilization: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmXidErrors: metadata.MetricConfig{ Enabled: false, }, }, @@ -131,14 +210,16 @@ func TestScrapeWithEmptyMetricsConfig(t *testing.T) { metrics, err := scraper.scrape(context.Background()) assert.NoError(t, err) - assert.Equal(t, metrics.MetricCount(), 0) + assert.Equal(t, 0, metrics.MetricCount()) + + assert.NoError(t, scraper.stop(context.Background())) } func TestScrapeOnPollingError(t *testing.T) { - realDcgmGetLatestValuesForFields := dcgmGetLatestValuesForFields - defer func() { dcgmGetLatestValuesForFields = 
realDcgmGetLatestValuesForFields }() - dcgmGetLatestValuesForFields = func(gpu uint, fields []dcgm.Short) ([]dcgm.FieldValue_v1, error) { - return nil, fmt.Errorf("DCGM polling error") + realDcgmGetValuesSince := dcgmGetValuesSince + defer func() { dcgmGetValuesSince = realDcgmGetValuesSince }() + dcgmGetValuesSince = func(_ dcgm.GroupHandle, _ dcgm.FieldHandle, _ time.Time) ([]dcgm.FieldValue_v2, time.Time, error) { + return nil, time.Time{}, fmt.Errorf("DCGM polling error") } var settings receiver.CreateSettings @@ -152,8 +233,10 @@ func TestScrapeOnPollingError(t *testing.T) { metrics, err := scraper.scrape(context.Background()) - assert.Error(t, err) - assert.Equal(t, metrics.MetricCount(), 0) + assert.NoError(t, err) + assert.Equal(t, 0, metrics.MetricCount()) + + assert.NoError(t, scraper.stop(context.Background())) } func TestScrapeOnProfilingPaused(t *testing.T) { @@ -166,32 +249,53 @@ func TestScrapeOnProfilingPaused(t *testing.T) { scraper := newDcgmScraper(config, settings) require.NotNil(t, scraper) - defer func() { testprofilepause.ResumeProfilingMetrics() }() - testprofilepause.PauseProfilingMetrics() + defer testprofilepause.ResumeProfilingMetrics(config.TCPAddrConfig.Endpoint) + err := testprofilepause.PauseProfilingMetrics(config.TCPAddrConfig.Endpoint) + if errors.Is(err, testprofilepause.FeatureNotSupportedError) { + t.Skipf("Pausing profiling not supported") + } else if err != nil { + t.Fatalf("Pausing profiling failed with error %v", err) + } time.Sleep(20 * time.Millisecond) - err := scraper.start(context.Background(), componenttest.NewNopHost()) + err = scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) - metrics, err := scraper.scrape(context.Background()) + metrics, err := collectScraperResult(t, context.Background(), scraper) assert.NoError(t, err) - require.Equal(t, metrics.MetricCount(), 2) + + assert.NoError(t, scraper.stop(context.Background())) expectedMetrics := []string{ - "dcgm.gpu.utilization", 
- "dcgm.gpu.memory.bytes_used", + "gpu.dcgm.utilization", + "gpu.dcgm.codec.decoder.utilization", + "gpu.dcgm.codec.encoder.utilization", + "gpu.dcgm.memory.bytes_used", + "gpu.dcgm.memory.bandwidth_utilization", + "gpu.dcgm.energy_consumption", + "gpu.dcgm.temperature", + "gpu.dcgm.clock.frequency", + "gpu.dcgm.clock.throttle_duration.time", + "gpu.dcgm.ecc_errors", } - ms := metrics.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics() + require.Greater(t, metrics.ResourceMetrics().Len(), 0) + + ilms := metrics.ResourceMetrics().At(0).ScopeMetrics() + require.Equal(t, 1, ilms.Len()) + + ms := ilms.At(0).Metrics() metricWasSeen := make(map[string]bool) for i := 0; i < ms.Len(); i++ { metricWasSeen[ms.At(i).Name()] = true } for _, metric := range expectedMetrics { - assert.Equal(t, metricWasSeen[metric], true) + assert.True(t, metricWasSeen[metric], metric) + delete(metricWasSeen, metric) } + assert.Equal(t, len(expectedMetrics), ms.Len(), fmt.Sprintf("%v", metricWasSeen)) } // loadExpectedScraperMetrics calls LoadExpectedMetrics to read the supported @@ -201,30 +305,59 @@ func loadExpectedScraperMetrics(t *testing.T, model string) map[string]int { t.Helper() expectedMetrics := make(map[string]int) receiverMetricNameToScraperMetricName := map[string]string{ - "dcgm.gpu.utilization": "dcgm.gpu.utilization", - "dcgm.gpu.memory.bytes_used": "dcgm.gpu.memory.bytes_used", - "dcgm.gpu.memory.bytes_free": "dcgm.gpu.memory.bytes_used", - "dcgm.gpu.profiling.sm_utilization": "dcgm.gpu.profiling.sm_utilization", - "dcgm.gpu.profiling.sm_occupancy": "dcgm.gpu.profiling.sm_occupancy", - "dcgm.gpu.profiling.dram_utilization": "dcgm.gpu.profiling.dram_utilization", - "dcgm.gpu.profiling.tensor_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.fp64_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.fp32_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.fp16_utilization": 
"dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.pcie_sent_bytes": "dcgm.gpu.profiling.pcie_traffic_rate", - "dcgm.gpu.profiling.pcie_received_bytes": "dcgm.gpu.profiling.pcie_traffic_rate", - "dcgm.gpu.profiling.nvlink_sent_bytes": "dcgm.gpu.profiling.nvlink_traffic_rate", - "dcgm.gpu.profiling.nvlink_received_bytes": "dcgm.gpu.profiling.nvlink_traffic_rate", + "DCGM_FI_PROF_GR_ENGINE_ACTIVE": "gpu.dcgm.utilization", + //"DCGM_FI_DEV_GPU_UTIL": "gpu.dcgm.utilization", + "DCGM_FI_PROF_SM_ACTIVE": "gpu.dcgm.sm.utilization", + "DCGM_FI_PROF_SM_OCCUPANCY": "gpu.dcgm.sm.occupancy", + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_DEV_ENC_UTIL": "gpu.dcgm.codec.encoder.utilization", + "DCGM_FI_DEV_DEC_UTIL": "gpu.dcgm.codec.decoder.utilization", + "DCGM_FI_DEV_FB_FREE": "gpu.dcgm.memory.bytes_used", + "DCGM_FI_DEV_FB_USED": "gpu.dcgm.memory.bytes_used", + "DCGM_FI_DEV_FB_RESERVED": "gpu.dcgm.memory.bytes_used", + "DCGM_FI_PROF_DRAM_ACTIVE": "gpu.dcgm.memory.bandwidth_utilization", + //"DCGM_FI_DEV_MEM_COPY_UTIL": "gpu.dcgm.memory.bandwidth_utilization", + "DCGM_FI_PROF_PCIE_TX_BYTES": "gpu.dcgm.pcie.io", + "DCGM_FI_PROF_PCIE_RX_BYTES": "gpu.dcgm.pcie.io", + "DCGM_FI_PROF_NVLINK_TX_BYTES": "gpu.dcgm.nvlink.io", + "DCGM_FI_PROF_NVLINK_RX_BYTES": "gpu.dcgm.nvlink.io", + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": "gpu.dcgm.energy_consumption", + //"DCGM_FI_DEV_POWER_USAGE": "gpu.dcgm.energy_consumption", + "DCGM_FI_DEV_GPU_TEMP": "gpu.dcgm.temperature", + "DCGM_FI_DEV_SM_CLOCK": "gpu.dcgm.clock.frequency", + "DCGM_FI_DEV_POWER_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_THERMAL_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + 
"DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_LOW_UTIL_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_RELIABILITY_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": "gpu.dcgm.ecc_errors", + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": "gpu.dcgm.ecc_errors", } - expectedReceiverMetrics := LoadExpectedMetrics(t, model) - for _, em := range expectedReceiverMetrics { - expectedMetrics[receiverMetricNameToScraperMetricName[em]] += 1 + supportedFields := LoadExpectedMetrics(t, model) + for _, em := range supportedFields.SupportedFields { + scraperMetric := receiverMetricNameToScraperMetricName[em] + if scraperMetric != "" { + expectedMetrics[scraperMetric] += 1 + } + // TODO: fallbacks. } return expectedMetrics } -func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetrics map[string]int) { +func validateScraperResult(t *testing.T, metrics pmetric.Metrics) { t.Helper() + rms := metrics.ResourceMetrics() + require.NotEmpty(t, rms.Len(), "missing ResourceMetrics") + modelValue, ok := rms.At(0).Resource().Attributes().Get("gpu.model") + require.True(t, ok, "missing gpu.model resource attribute") + expectedMetrics := loadExpectedScraperMetrics(t, modelValue.Str()) + metricWasSeen := make(map[string]bool) expectedDataPointCount := 0 for metric, expectedMetricDataPoints := range expectedMetrics { @@ -232,8 +365,13 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric expectedDataPointCount += expectedMetricDataPoints } - assert.LessOrEqual(t, len(expectedMetrics), metrics.MetricCount()) - assert.LessOrEqual(t, expectedDataPointCount, metrics.DataPointCount()) + assert.LessOrEqual(t, len(expectedMetrics), metrics.MetricCount(), "metric count") + assert.LessOrEqual(t, 
expectedDataPointCount, metrics.DataPointCount(), "data point count") + + r := metrics.ResourceMetrics().At(0).Resource() + assert.Contains(t, r.Attributes().AsRaw(), "gpu.number") + assert.Contains(t, r.Attributes().AsRaw(), "gpu.uuid") + assert.Contains(t, r.Attributes().AsRaw(), "gpu.model") ilms := metrics.ResourceMetrics().At(0).ScopeMetrics() require.Equal(t, 1, ilms.Len()) @@ -241,34 +379,83 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric ms := ilms.At(0).Metrics() for i := 0; i < ms.Len(); i++ { m := ms.At(i) - dps := m.Gauge().DataPoints() - for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*gpu_number:.*", dps.At(j).Attributes().AsRaw()) - assert.Regexp(t, ".*model:.*", dps.At(j).Attributes().AsRaw()) - assert.Regexp(t, ".*uuid:.*", dps.At(j).Attributes().AsRaw()) - } + var dps pmetric.NumberDataPointSlice + switch m.Name() { + case "gpu.dcgm.utilization": + fallthrough + case "gpu.dcgm.sm.utilization": + fallthrough + case "gpu.dcgm.sm.occupancy": + fallthrough + case "gpu.dcgm.pipe.utilization": + fallthrough + case "gpu.dcgm.codec.encoder.utilization": + fallthrough + case "gpu.dcgm.codec.decoder.utilization": + fallthrough + case "gpu.dcgm.memory.bytes_used": + fallthrough + case "gpu.dcgm.memory.bandwidth_utilization": + fallthrough + case "gpu.dcgm.temperature": + fallthrough + case "gpu.dcgm.clock.frequency": + dps = m.Gauge().DataPoints() + case "gpu.dcgm.energy_consumption": + fallthrough + case "gpu.dcgm.clock.throttle_duration.time": + fallthrough + case "gpu.dcgm.pcie.io": + fallthrough + case "gpu.dcgm.nvlink.io": + fallthrough + case "gpu.dcgm.ecc_errors": + fallthrough + case "gpu.dcgm.xid_errors": + dps = m.Sum().DataPoints() + default: + t.Errorf("Unexpected metric %s", m.Name()) + } assert.LessOrEqual(t, expectedMetrics[m.Name()], dps.Len()) switch m.Name() { - case "dcgm.gpu.utilization": - case "dcgm.gpu.memory.bytes_used": + case "gpu.dcgm.utilization": + case "gpu.dcgm.sm.utilization": + case 
"gpu.dcgm.sm.occupancy": + case "gpu.dcgm.pipe.utilization": for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*memory_state:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.pipe") } - case "dcgm.gpu.profiling.sm_utilization": - case "dcgm.gpu.profiling.sm_occupancy": - case "dcgm.gpu.profiling.dram_utilization": - case "dcgm.gpu.profiling.pipe_utilization": + case "gpu.dcgm.codec.encoder.utilization": + case "gpu.dcgm.codec.decoder.utilization": + case "gpu.dcgm.memory.bytes_used": for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*pipe:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.memory.state") } - case "dcgm.gpu.profiling.pcie_traffic_rate": + case "gpu.dcgm.memory.bandwidth_utilization": + case "gpu.dcgm.pcie.io": fallthrough - case "dcgm.gpu.profiling.nvlink_traffic_rate": + case "gpu.dcgm.nvlink.io": + for j := 0; j < dps.Len(); j++ { + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "network.io.direction") + } + case "gpu.dcgm.energy_consumption": + case "gpu.dcgm.temperature": + case "gpu.dcgm.clock.frequency": + case "gpu.dcgm.clock.throttle_duration.time": + for j := 0; j < dps.Len(); j++ { + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.clock.violation") + } + case "gpu.dcgm.ecc_errors": for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*direction:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.error.type") } + // TODO + //case "gpu.dcgm.xid_errors": + // for j := 0; j < dps.Len(); j++ { + // assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.error.xid") + // } default: t.Errorf("Unexpected metric %s", m.Name()) } @@ -277,6 +464,6 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric } for metric := range expectedMetrics { - assert.Equal(t, metricWasSeen[metric], true) + assert.True(t, metricWasSeen[metric], metric) } } diff --git 
a/receiver/dcgmreceiver/scraper_test.go b/receiver/dcgmreceiver/scraper_test.go index b4900aaa1..55b58f65f 100644 --- a/receiver/dcgmreceiver/scraper_test.go +++ b/receiver/dcgmreceiver/scraper_test.go @@ -20,6 +20,7 @@ package dcgmreceiver import ( "context" "strings" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -33,10 +34,13 @@ import ( func TestScraperWithoutDcgm(t *testing.T) { var settings receiver.CreateSettings + var mu sync.Mutex seenDcgmNotInstalledWarning := false settings.Logger = zaptest.NewLogger(t, zaptest.WrapOptions(zap.Hooks(func(e zapcore.Entry) error { if e.Level == zap.WarnLevel && strings.Contains(e.Message, "Unable to connect to DCGM daemon at localhost:5555 on libdcgm.so not Found; Is the DCGM daemon running") { + mu.Lock() seenDcgmNotInstalledWarning = true + mu.Unlock() } return nil }))) @@ -48,13 +52,17 @@ func TestScraperWithoutDcgm(t *testing.T) { require.NoError(t, err) metrics, err := scraper.scrape(context.Background()) + mu.Lock() assert.Equal(t, true, seenDcgmNotInstalledWarning) + mu.Unlock() assert.NoError(t, err) // If failed to init DCGM, should have no error assert.Equal(t, 0, metrics.MetricCount()) // Scrape again with DCGM not available metrics, err = scraper.scrape(context.Background()) + mu.Lock() assert.Equal(t, true, seenDcgmNotInstalledWarning) + mu.Unlock() assert.NoError(t, err) assert.Equal(t, 0, metrics.MetricCount()) diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml index 230ab0c17..30b24a858 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml @@ -1,17 +1,35 @@ model: NVIDIA A100-SXM4-40GB supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - 
DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml new file mode 100644 index 000000000..4c9dd91b4 --- /dev/null +++ b/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml @@ -0,0 +1,35 @@ +model: NVIDIA H100 80GB HBM3 +supported_fields: + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - 
DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE +unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml index ff81429c2..16ba2008d 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml @@ -1,17 +1,35 @@ model: NVIDIA L4 supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - 
DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: - DCGM_FI_PROF_PIPE_FP64_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_K80.yaml b/receiver/dcgmreceiver/testdata/Tesla_K80.yaml deleted file mode 100644 index 1ddf5ea1f..000000000 --- a/receiver/dcgmreceiver/testdata/Tesla_K80.yaml +++ /dev/null @@ -1,17 +0,0 @@ -model: Tesla K80 -supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED - - DCGM_FI_DEV_FB_FREE -unsupported_fields: - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES diff --git a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml index 729a6f39c..f2986c873 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml @@ -1,17 +1,35 @@ model: Tesla P100-PCIE-16GB supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION unsupported_fields: - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - 
DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml index 9b115f49a..052302234 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml @@ -1,17 +1,35 @@ model: Tesla P4 supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION unsupported_fields: - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES + - 
DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml index 37a066b37..e63ae2d89 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml @@ -1,17 +1,35 @@ model: Tesla T4 supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff 
--git a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml index aec19e80c..903ed6130 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml @@ -1,17 +1,35 @@ model: Tesla V100-SXM2-16GB supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go b/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go index 15a329095..3700382e9 100644 --- a/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go +++ 
b/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go @@ -22,9 +22,10 @@ package testprofilepause /* #include typedef uintptr_t dcgmHandle_t; -typedef enum dcgmReturn_enum { DCGM_ST_OK = 0 } dcgmReturn_t; +typedef enum dcgmReturn_enum { DCGM_ST_OK = 0, DCGM_ST_NOT_SUPPORTED = -6 } dcgmReturn_t; dcgmReturn_t dcgmProfPause(dcgmHandle_t pDcgmHandle); dcgmReturn_t dcgmProfResume(dcgmHandle_t pDcgmHandle); +const char *errorString(dcgmReturn_t result); */ import "C" import ( @@ -39,17 +40,54 @@ type dcgmHandle struct{ handle C.dcgmHandle_t } //go:linkname handle github.com/NVIDIA/go-dcgm/pkg/dcgm.handle var handle dcgmHandle -func PauseProfilingMetrics() { +var errorMap = map[C.dcgmReturn_t]error{ + C.DCGM_ST_OK: nil, +} + +func errorString(result C.dcgmReturn_t) error { + if err, ok := errorMap[result]; ok { + return err + } + msg := C.GoString(C.errorString(result)) + err := fmt.Errorf("%v", msg) + errorMap[result] = err + return err +} + +var FeatureNotSupportedError error +var initErrors = func() { + if FeatureNotSupportedError == nil { + FeatureNotSupportedError = errorString(C.DCGM_ST_NOT_SUPPORTED) + } +} + +func PauseProfilingMetrics(endpoint string) error { + initErrors() + cleanup, err := dcgm.Init(dcgm.Standalone, endpoint, "0") + if err != nil { + return err + } + defer cleanup() result := C.dcgmProfPause(handle.handle) - if result != 0 { - fmt.Printf("CUDA version %d", dcgm.DCGM_FI_CUDA_DRIVER_VERSION) - fmt.Printf("Failed to pause profiling (result %d)\n", result) + err = errorString(result) + if err != nil { + fmt.Printf("CUDA version %d\n", dcgm.DCGM_FI_CUDA_DRIVER_VERSION) + fmt.Printf("Failed to pause profiling (%v)\n", err) } + return err } -func ResumeProfilingMetrics() { +func ResumeProfilingMetrics(endpoint string) error { + initErrors() + cleanup, err := dcgm.Init(dcgm.Standalone, endpoint, "0") + if err != nil { + return err + } + defer cleanup() result := C.dcgmProfResume(handle.handle) - if result != 0 { - fmt.Printf("Failed to resume 
profiling (result %d)\n", result) + err = errorString(result) + if err != nil { + fmt.Printf("Failed to resume profiling (%v)\n", err) } + return err } diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 0ee795a4d..b33317531 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -18,65 +18,147 @@ package dcgmreceiver import ( - "unsafe" + "fmt" "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) -func (m *dcgmMetric) setFloat64(val float64) { - *(*float64)(unsafe.Pointer(&m.value[0])) = val +// For each metric, we need to track: +type metricStats struct { + // Timestamp (µs) + // Last value (for gauge metrics), as int64 or double + lastFieldValue *dcgm.FieldValue_v2 + // Integrated rate (always int), as {unit-seconds,unit-microseconds} + // This is intended for metrics that have a per-second unit, such as By/s. + // The metric value is multiplied by the timestamp delta, producing us.By/s in integratedRateMicroseconds + // When that overflows past 1e6, the overflow is put in integratedRateSeconds, which is in units of s.By/s, or just By. 
+ integratedRateSeconds int64 + integratedRateMicroseconds int64 + // Cumulative value (always int) + initialCumulativeValue int64 + cumulativeValue int64 } -func (m *dcgmMetric) asFloat64() float64 { - return *(*float64)(unsafe.Pointer(&m.value[0])) +func asInt64(fieldValue dcgm.FieldValue_v2) (int64, bool) { + // TODO: dcgm's Float64 and Int64 use undefined behavior + switch fieldValue.FieldType { + case dcgm.DCGM_FT_DOUBLE: + return int64(fieldValue.Float64()), true + case dcgm.DCGM_FT_INT64: + return fieldValue.Int64(), true + } + return 0, false } -func (m *dcgmMetric) setInt64(val int64) { - *(*int64)(unsafe.Pointer(&m.value[0])) = val +func asFloat64(fieldValue dcgm.FieldValue_v2) (float64, bool) { + switch fieldValue.FieldType { + case dcgm.DCGM_FT_DOUBLE: + return fieldValue.Float64(), true + case dcgm.DCGM_FT_INT64: + return float64(fieldValue.Int64()), true + } + return 0, false } -func (m *dcgmMetric) asInt64() int64 { - return *(*int64)(unsafe.Pointer(&m.value[0])) +func (m *metricStats) Update(fieldValue dcgm.FieldValue_v2) { + ts := fieldValue.Ts + intValue, intOk := asInt64(fieldValue) + if !intOk { + return + } + if m.lastFieldValue == nil { + m.initialCumulativeValue = intValue + } else { + if m.lastFieldValue.Ts >= ts { + return + } + m.cumulativeValue = intValue - m.initialCumulativeValue + + tsDelta := ts - m.lastFieldValue.Ts + if fieldValue.FieldType == dcgm.DCGM_FT_DOUBLE { + m.integratedRateMicroseconds += int64(float64(tsDelta) * fieldValue.Float64()) + } else { + m.integratedRateMicroseconds += tsDelta * intValue + } + m.integratedRateSeconds += m.integratedRateMicroseconds / 1000000 + m.integratedRateMicroseconds %= 1000000 + } + m.lastFieldValue = &fieldValue } -func isValidValue(fieldValue dcgm.FieldValue_v1) bool { +type MetricsMap map[string]*metricStats + +func (m MetricsMap) LastFloat64(name string) (float64, bool) { + if metric, ok := m[name]; ok && metric.lastFieldValue != nil { + return asFloat64(*metric.lastFieldValue) + } + 
return 0, false +} +func (m MetricsMap) LastInt64(name string) (int64, bool) { + if metric, ok := m[name]; ok && metric.lastFieldValue != nil { + return asInt64(*metric.lastFieldValue) + } + return 0, false +} +func (m MetricsMap) IntegratedRate(name string) (int64, bool) { + if metric, ok := m[name]; ok { + return metric.integratedRateSeconds, true + } + return 0, false +} +func (m MetricsMap) CumulativeTotal(name string) (int64, bool) { + if metric, ok := m[name]; ok { + return metric.cumulativeValue, true + } + return 0, false +} + +var ( + errBlankValue = fmt.Errorf("unspecified blank value") + errDataNotFound = fmt.Errorf("data not found") + errNotSupported = fmt.Errorf("field not supported") + errPermissionDenied = fmt.Errorf("no permission to fetch value") + errUnexpectedType = fmt.Errorf("unexpected data type") +) + +func isValidValue(fieldValue dcgm.FieldValue_v2) error { switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: switch v := fieldValue.Float64(); v { case dcgm.DCGM_FT_FP64_BLANK: - return false + return errBlankValue case dcgm.DCGM_FT_FP64_NOT_FOUND: - return false + return errDataNotFound case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: - return false + return errNotSupported case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: - return false + return errPermissionDenied } case dcgm.DCGM_FT_INT64: switch v := fieldValue.Int64(); v { case dcgm.DCGM_FT_INT32_BLANK: - return false + return errBlankValue case dcgm.DCGM_FT_INT32_NOT_FOUND: - return false + return errDataNotFound case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: - return false + return errNotSupported case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: - return false + return errPermissionDenied case dcgm.DCGM_FT_INT64_BLANK: - return false + return errBlankValue case dcgm.DCGM_FT_INT64_NOT_FOUND: - return false + return errDataNotFound case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: - return false + return errNotSupported case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: - return false + return errPermissionDenied } // dcgm.DCGM_FT_STRING 
also exists but we don't expect it default: - return false + return errUnexpectedType } - return true + return nil } diff --git a/receiver/dcgmreceiver/util_test.go b/receiver/dcgmreceiver/util_test.go index daeace14d..a9a206afc 100644 --- a/receiver/dcgmreceiver/util_test.go +++ b/receiver/dcgmreceiver/util_test.go @@ -1,4 +1,4 @@ -// Copyright 2023 Google LLC +// Copyright 2024 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,23 +18,116 @@ package dcgmreceiver import ( + "bytes" + "encoding/binary" "testing" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -func TestDcgmMetricSetFloat64(t *testing.T) { - var metric dcgmMetric - metric.setFloat64(23.0) - require.Equal(t, metric.asFloat64(), 23.0) - metric.setFloat64(43.0) - require.Equal(t, metric.asFloat64(), 43.0) +func fieldValue(t *testing.T, ts int64, fieldType uint, value any) dcgm.FieldValue_v2 { + buf := new(bytes.Buffer) + require.NoError(t, binary.Write(buf, binary.NativeEndian, value)) + var valueArr [4096]byte + copy(valueArr[:], buf.Bytes()) + return dcgm.FieldValue_v2{ + Ts: ts, + FieldType: fieldType, + Value: valueArr, + } } -func TestDcgmMetricSetInt64(t *testing.T) { - var metric dcgmMetric - metric.setInt64(23) - require.Equal(t, metric.asInt64(), int64(23)) - metric.setInt64(43) - require.Equal(t, metric.asInt64(), int64(43)) +func fieldValueInt64(t *testing.T, ts int64, value int64) dcgm.FieldValue_v2 { + return fieldValue(t, ts, dcgm.DCGM_FT_INT64, value) +} + +func fieldValueFloat64(t *testing.T, ts int64, value float64) dcgm.FieldValue_v2 { + return fieldValue(t, ts, dcgm.DCGM_FT_DOUBLE, value) +} + +func testMetricStatsRate[V int64 | float64](t *testing.T, fv func(*testing.T, int64, V) dcgm.FieldValue_v2) { + stats := &metricStats{} + + type P struct { + ts int64 + v int64 + } + p := func(stats *metricStats) P { + if 
stats.lastFieldValue == nil { + return P{0, stats.integratedRateSeconds} + } + return P{stats.lastFieldValue.Ts, stats.integratedRateSeconds} + } + + stats.Update(fv(t, 10, 0)) + require.Equal(t, P{10, 0}, p(stats)) + // Ensure updates affect aggregated values. + stats.Update(fv(t, 15, 1e6)) + assert.Equal(t, P{15, 5}, p(stats)) + // Ensure stale points are ignored. + stats.Update(fv(t, 12, 1e8)) + assert.Equal(t, P{15, 5}, p(stats)) + stats.Update(fv(t, 15, 1.e8)) + assert.Equal(t, P{15, 5}, p(stats)) + // Ensure updates affect aggregated values. + stats.Update(fv(t, 20, 2.e6)) + assert.Equal(t, P{20, 15}, p(stats)) + // Ensure zero rates don't change the aggregated value. + stats.Update(fv(t, 25, 0)) + assert.Equal(t, P{25, 15}, p(stats)) +} + +func TestMetricStatsRateInt64(t *testing.T) { + testMetricStatsRate[int64](t, fieldValueInt64) +} + +func TestMetricStatsRateFloat64(t *testing.T) { + testMetricStatsRate[float64](t, fieldValueFloat64) +} + +func testMetricStatsCumulative[V int64 | float64](t *testing.T, fv func(*testing.T, int64, V) dcgm.FieldValue_v2) { + stats := &metricStats{} + + type P struct { + ts int64 + v int64 + } + p := func(stats *metricStats) P { + if stats.lastFieldValue == nil { + return P{0, stats.cumulativeValue} + } + return P{stats.lastFieldValue.Ts, stats.cumulativeValue} + } + + require.Equal(t, int64(0), stats.initialCumulativeValue) + require.Equal(t, P{0, 0}, p(stats)) + // Ensure first updates sets the baseline. + stats.Update(fv(t, 15, 50)) + require.Equal(t, int64(50), stats.initialCumulativeValue) + assert.Equal(t, P{15, 0}, p(stats)) + // Ensure updates affect values, but not the baseline. + stats.Update(fv(t, 20, 80)) + assert.Equal(t, int64(50), stats.initialCumulativeValue) + assert.Equal(t, P{20, 30}, p(stats)) + // Ensure stale points are ignored. + stats.Update(fv(t, 18, 1e8)) + assert.Equal(t, P{20, 30}, p(stats)) + stats.Update(fv(t, 20, 1e8)) + assert.Equal(t, P{20, 30}, p(stats)) + // Ensure updates affect values. 
+ stats.Update(fv(t, 25, 100)) + assert.Equal(t, P{25, 50}, p(stats)) + // Ensure same inputs don't affect values. + stats.Update(fv(t, 30, 100)) + assert.Equal(t, P{30, 50}, p(stats)) +} + +func TestMetricStatsCumulativeInt64(t *testing.T) { + testMetricStatsCumulative[int64](t, fieldValueInt64) +} + +func TestMetricStatsCumulativeFloat64(t *testing.T) { + testMetricStatsCumulative[float64](t, fieldValueFloat64) } diff --git a/service/components.go b/service/components.go index e6b46bbc2..60b84f5c0 100644 --- a/service/components.go +++ b/service/components.go @@ -18,6 +18,8 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/fileexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlecloudexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlemanagedprometheusexporter" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" @@ -141,6 +143,8 @@ func components() (otelcol.Factories, error) { processors := []processor.Factory{ agentmetricsprocessor.NewFactory(), casttosumprocessor.NewFactory(), + cumulativetodeltaprocessor.NewFactory(), + deltatorateprocessor.NewFactory(), filterprocessor.NewFactory(), normalizesumsprocessor.NewFactory(), metricstransformprocessor.NewFactory(),