diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 6c1122294..1a2e97c7b 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -384,9 +384,11 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) var err scrapererror.ScrapeErrors gpuMetrics := make(map[uint][]dcgmMetric) for _, gpuIndex := range client.deviceIndices { + client.logger.Debugf("Polling DCGM daemon for GPU %d", gpuIndex) retry := true for i := 0; retry && i < client.maxRetries; i++ { fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) + client.logger.Debugf("Got %d field values", len(fieldValues)) if pollErr == nil { gpuMetrics[gpuIndex], retry = client.appendMetrics(gpuMetrics[gpuIndex], gpuIndex, fieldValues) if retry { diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index efbede894..69fd033bf 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -182,7 +182,13 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { return s.mb.Emit(), err } + s.settings.Logger.Sugar().Debug("Client created, collecting metrics") deviceMetrics, err := s.client.collectDeviceMetrics() + if err != nil { + s.settings.Logger.Sugar().Warnf("Metrics not collected; err=%v", err) + return s.mb.Emit(), err + } + s.settings.Logger.Sugar().Debugf("Metrics collected: %d", len(deviceMetrics)) now := pcommon.NewTimestampFromTime(time.Now()) for gpuIndex, gpuMetrics := range deviceMetrics { @@ -190,6 +196,7 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { for _, metric := range gpuMetrics { metricsByName[metric.name] = append(metricsByName[metric.name], metric) } + s.settings.Logger.Sugar().Debugf("Got %d unique metrics: %v", len(metricsByName), metricsByName) metrics := make(map[string]dcgmMetric) for name, points := range metricsByName { slices.SortStableFunc(points, func(a, b dcgmMetric) int {