From f88fe9c6e11e99509cd04a4c16f134a8d271137e Mon Sep 17 00:00:00 2001 From: kaiyan-sheng Date: Mon, 27 May 2019 08:32:23 -0600 Subject: [PATCH] Cherry-pick to 7.0: Ignore prometheus metrics when their values are NaN or Inf (#12084) (#12286) * Ignore prometheus metrics when their values are NaN or Inf (#12084) * Ignore prometheus metrics when their values are NaN or Inf * Avoid NaN/Inf in prometheus helper * Add checks on Gauge, Summary and Counter * Add NaN/Inf check on histogram values (cherry picked from commit 9244477294e3494723c0e76cbbe4851d9c451fba) * Fix changelog --- CHANGELOG.next.asciidoc | 1 + metricbeat/helper/prometheus/metric.go | 31 ++- metricbeat/helper/prometheus/prometheus.go | 6 +- .../helper/prometheus/prometheus_test.go | 93 +++++++- .../_meta/testdata/metrics-with-naninf.plain | 32 +++ .../metrics-with-naninf.plain-expected.json | 222 ++++++++++++++++++ .../module/prometheus/collector/data.go | 66 +++--- 7 files changed, 411 insertions(+), 40 deletions(-) create mode 100644 metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain create mode 100644 metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain-expected.json diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index f394fda4347..19fe5489979 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -48,6 +48,7 @@ https://github.com/elastic/beats/compare/v7.0.0...7.0[Check the HEAD diff] - Update documentation with cloudwatch:ListMetrics permission. {pull}11987[11987] - Change some field type from scaled_float to long in aws module. {pull}11982[11982] +- Ignore prometheus metrics when their values are NaN or Inf. {pull}12084[12084] {issue}10849[10849] *Packetbeat* diff --git a/metricbeat/helper/prometheus/metric.go b/metricbeat/helper/prometheus/metric.go index 716ea1a34af..4b6c424e4b3 100644 --- a/metricbeat/helper/prometheus/metric.go +++ b/metricbeat/helper/prometheus/metric.go @@ -128,28 +128,33 @@ func (m *commonMetric) GetField() string { func (m *commonMetric) GetValue(metric *dto.Metric) interface{} { counter := metric.GetCounter() if counter != nil { - return int64(counter.GetValue()) + if !math.IsNaN(counter.GetValue()) && !math.IsInf(counter.GetValue(), 0) { + return int64(counter.GetValue()) + } } gauge := metric.GetGauge() if gauge != nil { - return gauge.GetValue() + if !math.IsNaN(gauge.GetValue()) && !math.IsInf(gauge.GetValue(), 0) { + return gauge.GetValue() + } } summary := metric.GetSummary() if summary != nil { value := common.MapStr{} - value["sum"] = summary.GetSampleSum() - value["count"] = summary.GetSampleCount() + if !math.IsNaN(summary.GetSampleSum()) && !math.IsInf(summary.GetSampleSum(), 0) { + value["sum"] = summary.GetSampleSum() + value["count"] = summary.GetSampleCount() + } quantiles := summary.GetQuantile() percentileMap := common.MapStr{} for _, quantile := range quantiles { - if !math.IsNaN(quantile.GetValue()) { - key := strconv.FormatFloat((100 * quantile.GetQuantile()), 'f', -1, 64) + if !math.IsNaN(quantile.GetValue()) && !math.IsInf(quantile.GetValue(), 0) { + key := strconv.FormatFloat(100*quantile.GetQuantile(), 'f', -1, 64) percentileMap[key] = quantile.GetValue() } - } if len(percentileMap) != 0 { @@ -162,14 +167,18 @@ func (m *commonMetric) GetValue(metric *dto.Metric) interface{} { histogram := metric.GetHistogram() if histogram != nil { value := common.MapStr{} - value["sum"] = histogram.GetSampleSum() - value["count"] = histogram.GetSampleCount() + if !math.IsNaN(histogram.GetSampleSum()) && !math.IsInf(histogram.GetSampleSum(), 0) { + value["sum"] = histogram.GetSampleSum() + value["count"] = histogram.GetSampleCount() + } buckets := histogram.GetBucket() bucketMap := common.MapStr{} for _, bucket := range buckets { - key := strconv.FormatFloat(bucket.GetUpperBound(), 'f', -1, 64) - bucketMap[key] = bucket.GetCumulativeCount() + if bucket.GetCumulativeCount() != uint64(math.NaN()) && bucket.GetCumulativeCount() != uint64(math.Inf(0)) { + key := strconv.FormatFloat(bucket.GetUpperBound(), 'f', -1, 64) + bucketMap[key] = bucket.GetCumulativeCount() + } } if len(bucketMap) != 0 { diff --git a/metricbeat/helper/prometheus/prometheus.go b/metricbeat/helper/prometheus/prometheus.go index 27456b09f4b..442c2e5a65a 100644 --- a/metricbeat/helper/prometheus/prometheus.go +++ b/metricbeat/helper/prometheus/prometheus.go @@ -160,7 +160,11 @@ func (p *prometheus) GetProcessedMetrics(mapping *MetricsMapping) ([]common.MapS if field != "" { // Put it in the event if it's a common metric event := getEvent(eventsMap, keyLabels) - event.Put(field, value) + update := common.MapStr{} + update.Put(field, value) + // value may be a mapstr (for histograms and summaries), do a deep update to avoid smashing existing fields + event.DeepUpdate(update) + event.DeepUpdate(labels) } } diff --git a/metricbeat/helper/prometheus/prometheus_test.go b/metricbeat/helper/prometheus/prometheus_test.go index 2a2e8c09568..a2ee600f4f6 100644 --- a/metricbeat/helper/prometheus/prometheus_test.go +++ b/metricbeat/helper/prometheus/prometheus_test.go @@ -36,12 +36,22 @@ const promMetrics = ` first_metric{label1="value1",label2="value2",label3="Value3",label4="FOO"} 1 # TYPE second_metric gauge second_metric{label1="value1",label3="othervalue"} 0 +# TYPE metrics_one_count_total counter +metrics_one_count_total{name="jane",surname="foster"} NaN +metrics_one_count_total{name="john",surname="williams"} +Inf +metrics_one_count_total{name="jahn",surname="baldwin",age="30"} 3 # TYPE summary_metric summary summary_metric{quantile="0.5"} 29735 summary_metric{quantile="0.9"} 47103 summary_metric{quantile="0.99"} 50681 summary_metric_sum 234892394 summary_metric_count 44000 +# TYPE summary_metric_with_nan_inf summary +summary_metric_with_nan_inf{quantile="0.5"} NaN +summary_metric_with_nan_inf{quantile="0.9"} +Inf +summary_metric_with_nan_inf{quantile="0.99"} 50681 +summary_metric_with_nan_inf_sum 50681 +summary_metric_with_nan_inf_count 44000 # TYPE histogram_metric histogram histogram_metric_bucket{le="1000"} 1 histogram_metric_bucket{le="10000"} 1 @@ -52,7 +62,16 @@ histogram_metric_bucket{le="1e+09"} 1 histogram_metric_bucket{le="+Inf"} 1 histogram_metric_sum 117 histogram_metric_count 1 - +# TYPE histogram_metric_with_nan_inf histogram +histogram_metric_with_nan_inf_bucket{le="1000"} NaN +histogram_metric_with_nan_inf_bucket{le="10000"} +Inf +histogram_metric_with_nan_inf_bucket{le="100000"} -Inf +histogram_metric_with_nan_inf_bucket{le="1e+06"} 1 +histogram_metric_with_nan_inf_bucket{le="1e+08"} 1 +histogram_metric_with_nan_inf_bucket{le="1e+09"} 1 +histogram_metric_with_nan_inf_bucket{le="+Inf"} 1 +histogram_metric_with_nan_inf_sum 117 +histogram_metric_with_nan_inf_count 1 ` type mockFetcher struct{} @@ -279,6 +298,33 @@ func TestPrometheus(t *testing.T) { }, expected: []common.MapStr{}, }, + { + msg: "Counter metric with NaN Inf", + mapping: &MetricsMapping{ + Metrics: map[string]MetricMap{ + "metrics_one_count_total": Metric("metrics.one.count"), + }, + Labels: map[string]LabelMap{ + "name": Label("metrics.one.labels.name"), + "surname": Label("metrics.one.labels.surname"), + "age": Label("metrics.one.labels.age"), + }, + }, + expected: []common.MapStr{ + common.MapStr{ + "metrics": common.MapStr{ + "one": common.MapStr{ + "count": int64(3.0), + "labels": common.MapStr{ + "name": "jahn", + "surname": "baldwin", + "age": "30", + }, + }, + }, + }, + }, + }, { msg: "Summary metric", mapping: &MetricsMapping{ @@ -302,6 +348,27 @@ func TestPrometheus(t *testing.T) { }, }, }, + { + msg: "Summary metric with NaN Inf", + mapping: &MetricsMapping{ + Metrics: map[string]MetricMap{ + "summary_metric_with_nan_inf": Metric("summary.metric"), + }, + }, + expected: []common.MapStr{ + common.MapStr{ + "summary": common.MapStr{ + "metric": common.MapStr{ + "sum": 50681.0, + "count": uint64(44000), + "percentile": common.MapStr{ + "99": 50681.0, + }, + }, + }, + }, + }, + }, { msg: "Histogram metric", mapping: &MetricsMapping{ @@ -329,6 +396,30 @@ func TestPrometheus(t *testing.T) { }, }, }, + { + msg: "Histogram metric with NaN Inf", + mapping: &MetricsMapping{ + Metrics: map[string]MetricMap{ + "histogram_metric_with_nan_inf": Metric("histogram.metric"), + }, + }, + expected: []common.MapStr{ + common.MapStr{ + "histogram": common.MapStr{ + "metric": common.MapStr{ + "count": uint64(1), + "bucket": common.MapStr{ + "1000000000": uint64(1), + "+Inf": uint64(1), + "1000000": uint64(1), + "100000000": uint64(1), + }, + "sum": 117.0, + }, + }, + }, + }, + }, } for _, test := range tests { diff --git a/metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain b/metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain new file mode 100644 index 00000000000..1101ede5d96 --- /dev/null +++ b/metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain @@ -0,0 +1,32 @@ +# HELP kafka_consumer_records_lag_records The latest lag of the partition +# TYPE kafka_consumer_records_lag_records gauge +kafka_consumer_records_lag_records{client_id="consumer1",} NaN +kafka_consumer_records_lag_records{client_id="consumer2",} +Inf +kafka_consumer_records_lag_records{client_id="consumer3",} -Inf +kafka_consumer_records_lag_records{client_id="consumer4",} 5 +# HELP http_failures Total number of http request failures +# TYPE http_failures counter +http_failures{method="GET"} 2 +http_failures{method="POST"} NaN +http_failures{method="DELETE"} +Inf +# HELP go_gc_duration_seconds A summary of the GC invocation durations. +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds{quantile="0",} NaN +go_gc_duration_seconds{quantile="0.25",} +Inf +go_gc_duration_seconds{quantile="0.5",} -Inf +go_gc_duration_seconds{quantile="0.75"} 9.8154e-05 +go_gc_duration_seconds{quantile="1",} 0.011689149 +go_gc_duration_seconds_sum 3.451780079 +go_gc_duration_seconds_count 13118 +# HELP http_request_duration_seconds request duration histogram +# TYPE http_request_duration_seconds histogram +http_request_duration_seconds_bucket{le="0.1"} +Inf +http_request_duration_seconds_bucket{le="0.2"} -Inf +http_request_duration_seconds_bucket{le="0.5"} NaN +http_request_duration_seconds_bucket{le="1"} 1 +http_request_duration_seconds_bucket{le="2"} 2 +http_request_duration_seconds_bucket{le="3"} 3 +http_request_duration_seconds_bucket{le="5"} 3 +http_request_duration_seconds_bucket{le="+Inf"} 3 +http_request_duration_seconds_sum 6 +http_request_duration_seconds_count 3 diff --git a/metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain-expected.json b/metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain-expected.json new file mode 100644 index 00000000000..60efae224fe --- /dev/null +++ b/metricbeat/module/prometheus/collector/_meta/testdata/metrics-with-naninf.plain-expected.json @@ -0,0 +1,222 @@ +[ + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "client_id": "consumer4" + }, + "metrics": { + "kafka_consumer_records_lag_records": 5 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "method": "GET" + }, + "metrics": { + "http_failures": 2 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "le": "5" + }, + "metrics": { + "http_request_duration_seconds_bucket": 3 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "le": "+Inf" + }, + "metrics": { + "http_request_duration_seconds_bucket": 3 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "le": "3" + }, + "metrics": { + "http_request_duration_seconds_bucket": 3 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "metrics": { + "go_gc_duration_seconds_count": 13118, + "go_gc_duration_seconds_sum": 3.451780079, + "http_request_duration_seconds_count": 3, + "http_request_duration_seconds_sum": 6 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "le": "2" + }, + "metrics": { + "http_request_duration_seconds_bucket": 2 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "le": "1" + }, + "metrics": { + "http_request_duration_seconds_bucket": 1 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "quantile": "1" + }, + "metrics": { + "go_gc_duration_seconds": 0.011689149 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + }, + { + "event": { + "dataset": "prometheus.collector", + "duration": 115000, + "module": "prometheus" + }, + "metricset": { + "name": "collector" + }, + "prometheus": { + "labels": { + "quantile": "0.75" + }, + "metrics": { + "go_gc_duration_seconds": 0.000098154 + } + }, + "service": { + "address": "127.0.0.1:55555", + "type": "prometheus" + } + } +] \ No newline at end of file diff --git a/metricbeat/module/prometheus/collector/data.go b/metricbeat/module/prometheus/collector/data.go index a2e191d3309..ad4e599ae31 100644 --- a/metricbeat/module/prometheus/collector/data.go +++ b/metricbeat/module/prometheus/collector/data.go @@ -55,36 +55,42 @@ func getPromEventsFromMetricFamily(mf *dto.MetricFamily) []PromEvent { counter := metric.GetCounter() if counter != nil { - events = append(events, PromEvent{ - data: common.MapStr{ - name: counter.GetValue(), - }, - labels: labels, - }) + if !math.IsNaN(counter.GetValue()) && !math.IsInf(counter.GetValue(), 0) { + events = append(events, PromEvent{ + data: common.MapStr{ + name: counter.GetValue(), + }, + labels: labels, + }) + } } gauge := metric.GetGauge() if gauge != nil { - events = append(events, PromEvent{ - data: common.MapStr{ - name: gauge.GetValue(), - }, - labels: labels, - }) + if !math.IsNaN(gauge.GetValue()) && !math.IsInf(gauge.GetValue(), 0) { + events = append(events, PromEvent{ + data: common.MapStr{ + name: gauge.GetValue(), + }, + labels: labels, + }) + } } summary := metric.GetSummary() if summary != nil { - events = append(events, PromEvent{ - data: common.MapStr{ - name + "_sum": summary.GetSampleSum(), - name + "_count": summary.GetSampleCount(), - }, - labels: labels, - }) + if !math.IsNaN(summary.GetSampleSum()) && !math.IsInf(summary.GetSampleSum(), 0) { + events = append(events, PromEvent{ + data: common.MapStr{ + name + "_sum": summary.GetSampleSum(), + name + "_count": summary.GetSampleCount(), + }, + labels: labels, + }) + } for _, quantile := range summary.GetQuantile() { - if math.IsNaN(quantile.GetValue()) { + if math.IsNaN(quantile.GetValue()) || math.IsInf(quantile.GetValue(), 0) { continue } @@ -101,15 +107,21 @@ func getPromEventsFromMetricFamily(mf *dto.MetricFamily) []PromEvent { histogram := metric.GetHistogram() if histogram != nil { - events = append(events, PromEvent{ - data: common.MapStr{ - name + "_sum": histogram.GetSampleSum(), - name + "_count": histogram.GetSampleCount(), - }, - labels: labels, - }) + if !math.IsNaN(histogram.GetSampleSum()) && !math.IsInf(histogram.GetSampleSum(), 0) { + events = append(events, PromEvent{ + data: common.MapStr{ + name + "_sum": histogram.GetSampleSum(), + name + "_count": histogram.GetSampleCount(), + }, + labels: labels, + }) + } for _, bucket := range histogram.GetBucket() { + if bucket.GetCumulativeCount() == uint64(math.NaN()) || bucket.GetCumulativeCount() == uint64(math.Inf(0)) { + continue + } + bucketLabels := labels.Clone() bucketLabels["le"] = strconv.FormatFloat(bucket.GetUpperBound(), 'f', -1, 64)