From 34548c3e1b247d093ba0caf460106821b498bdbb Mon Sep 17 00:00:00 2001 From: VihasMakwana <121151420+VihasMakwana@users.noreply.github.com> Date: Fri, 23 Aug 2024 18:24:57 +0530 Subject: [PATCH] [metricbeat][system/process, system/process_summary]: mark module as healthy if metrics are partially filled (#40565) * chore: mark module as healthy if metrics are partially filled * chore: mark module as healthy if metrics are partially filled * fix: use errors.As * fix: fix lint * Update metricbeat/mb/event.go Co-authored-by: Mauri de Souza Meneguzzo * fix: changelog --------- Co-authored-by: Mauri de Souza Meneguzzo (cherry picked from commit 1f033c9eed2ccd5b5acca830b47ec3c2261ef026) # Conflicts: # metricbeat/mb/module/wrapper.go # metricbeat/module/system/process_summary/process_summary.go --- CHANGELOG.next.asciidoc | 7 +++++++ metricbeat/mb/event.go | 15 +++++++++++++ metricbeat/mb/module/wrapper.go | 21 +++++++++++++++++++ metricbeat/module/system/process/process.go | 2 ++ .../system/process_summary/process_summary.go | 9 ++++++++ 5 files changed, 54 insertions(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 4dd7d83a032..7cbb5c2cbe5 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -39,6 +39,13 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Metricbeat* - Setting period for counter cache for Prometheus remote_write at least to 60sec {pull}38553[38553] +- Add support of Graphite series 1.1.0+ tagging extension for statsd module. {pull}39619[39619] +- Allow metricsets to report their status via control v2 protocol. {pull}40025[40025] +- Remove fallback to the node limit for the `kubernetes.pod.cpu.usage.limit.pct` and `kubernetes.pod.memory.usage.limit.pct` metrics calculation +- Add support for Kibana status metricset in v8 format {pull}40275[40275] +- Add new metrics for the vSphere Datastore metricset. {pull}40441[40441] +- Update metrics for the vSphere Host metricset. {pull}40429[40429] +- Mark system process metricsets as running if metrics are partially available {pull}40565[40565] - Added back `elasticsearch.node.stats.jvm.mem.pools.*` to the `node_stats` metricset {pull}40571[40571] *Osquerybeat* diff --git a/metricbeat/mb/event.go b/metricbeat/mb/event.go index 98430732ef4..fb6907b6396 100644 --- a/metricbeat/mb/event.go +++ b/metricbeat/mb/event.go @@ -214,3 +214,18 @@ func tryToMapStr(v interface{}) (mapstr.M, bool) { return nil, false } } + +// PartialMetricsError indicates that metrics are only partially filled. +// This will be removed once we fix the underlying problem. +// See https://github.com/elastic/beats/issues/40542 for more details. +type PartialMetricsError struct { + Err error +} + +func (p PartialMetricsError) Error() string { + return p.Err.Error() +} + +func (p PartialMetricsError) Unwrap() error { + return p.Err +} diff --git a/metricbeat/mb/module/wrapper.go b/metricbeat/mb/module/wrapper.go index d41bdf01497..b6fc76e298c 100644 --- a/metricbeat/mb/module/wrapper.go +++ b/metricbeat/mb/module/wrapper.go @@ -19,6 +19,7 @@ package module import ( "context" + "errors" "fmt" "math/rand" "sync" @@ -253,6 +254,16 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) { err := fetcher.Fetch(reporter.V2()) if err != nil { reporter.V2().Error(err) +<<<<<<< HEAD +======= + if errors.As(err, &mb.PartialMetricsError{}) { + // mark module as running if metrics are partially available and display the error message + msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } else { + // mark it as degraded for any other issue encountered + msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } +>>>>>>> 1f033c9eed ([metricbeat][system/process, system/process_summary]: mark module as healthy if metrics are partially filled (#40565)) logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err) } case mb.ReportingMetricSetV2WithContext: @@ -260,6 +271,16 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) { err := fetcher.Fetch(ctx, reporter.V2()) if err != nil { reporter.V2().Error(err) +<<<<<<< HEAD +======= + if errors.As(err, &mb.PartialMetricsError{}) { + // mark module as running if metrics are partially available and display the error message + msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } else { + // mark it as degraded for any other issue encountered + msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } +>>>>>>> 1f033c9eed ([metricbeat][system/process, system/process_summary]: mark module as healthy if metrics are partially filled (#40565)) logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err) } default: diff --git a/metricbeat/module/system/process/process.go b/metricbeat/module/system/process/process.go index ad9fa8d5ac0..5693913ec1d 100644 --- a/metricbeat/module/system/process/process.go +++ b/metricbeat/module/system/process/process.go @@ -113,6 +113,8 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { procs, roots, err := m.stats.Get() if err != nil { return fmt.Errorf("process stats: %w", err) + } else if (err != nil && errors.Is(err, process.NonFatalErr{})) { + err = mb.PartialMetricsError{Err: err} } for evtI := range procs { diff --git a/metricbeat/module/system/process_summary/process_summary.go b/metricbeat/module/system/process_summary/process_summary.go index c64a0c1d3e1..e6729fabd04 100644 --- a/metricbeat/module/system/process_summary/process_summary.go +++ b/metricbeat/module/system/process_summary/process_summary.go @@ -68,9 +68,18 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // descriptive error must be returned. func (m *MetricSet) Fetch(r mb.ReporterV2) error { +<<<<<<< HEAD procList, err := process.ListStates(m.sys) if err != nil { return fmt.Errorf("error fetching process list: %w", err) +======= + procList, degradeErr := process.ListStates(m.sys) + if degradeErr != nil && !errors.Is(degradeErr, process.NonFatalErr{}) { + // return only if the error is fatal in nature + return fmt.Errorf("error fetching process list: %w", degradeErr) + } else if (degradeErr != nil && errors.Is(degradeErr, process.NonFatalErr{})) { + degradeErr = mb.PartialMetricsError{Err: degradeErr} +>>>>>>> 1f033c9eed ([metricbeat][system/process, system/process_summary]: mark module as healthy if metrics are partially filled (#40565)) } procStates := map[string]int{}