From da14b593ff19cfcde15dfae9bc47922b7dd14b0d Mon Sep 17 00:00:00 2001 From: Morten Urban Date: Wed, 4 May 2022 16:15:50 +0200 Subject: [PATCH 1/2] fix(parsers.nagios): metrics will always return a supported status code, error is now in service_output if needed #11061 --- plugins/inputs/exec/exec.go | 9 ++--- plugins/parsers/nagios/parser.go | 41 ++++++++++++------- plugins/parsers/nagios/parser_test.go | 58 +++++++++++++++++---------- 3 files changed, 66 insertions(+), 42 deletions(-) diff --git a/plugins/inputs/exec/exec.go b/plugins/inputs/exec/exec.go index 1e42bd3053332..3369979eafb55 100644 --- a/plugins/inputs/exec/exec.go +++ b/plugins/inputs/exec/exec.go @@ -127,9 +127,9 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync defer wg.Done() _, isNagios := e.parser.(*nagios.NagiosParser) - out, errbuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout)) + out, errBuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout)) if !isNagios && runErr != nil { - err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errbuf)) + err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errBuf)) acc.AddError(err) return } @@ -141,10 +141,7 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync } if isNagios { - metrics, err = nagios.TryAddState(runErr, metrics) - if err != nil { - e.Log.Errorf("Failed to add nagios state: %s", err) - } + metrics = nagios.AddState(runErr, errBuf, metrics) } for _, m := range metrics { diff --git a/plugins/parsers/nagios/parser.go b/plugins/parsers/nagios/parser.go index f45e07a82eb1d..ca50a97f5b5e5 100644 --- a/plugins/parsers/nagios/parser.go +++ b/plugins/parsers/nagios/parser.go @@ -4,7 +4,6 @@ import ( "bufio" "bytes" "errors" - "fmt" "os/exec" "regexp" "strconv" @@ -25,10 +24,7 @@ func getExitCode(err error) (int, error) { ee, ok := err.(*exec.ExitError) if !ok { - // If it is not an *exec.ExitError, then it must be - // an io error, but docs do not say anything about the - // exit code in this case. - return 0, err + return 3, err } ws, ok := ee.Sys().(syscall.WaitStatus) @@ -39,19 +35,35 @@ func getExitCode(err error) (int, error) { return ws.ExitStatus(), nil } -// TryAddState attempts to add a state derived from the runErr. -// If any error occurs, it is guaranteed to be returned along with -// the initial metric slice. -func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, error) { - state, err := getExitCode(runErr) - if err != nil { - return metrics, fmt.Errorf("exec: get exit code: %s", err) +// AddState adds a state derived from the runErr. Unknown state will be set as fallback. +// If any error occurs, it is guaranteed to be added to the service output. +// An updated slice of metrics will be returned. +func AddState(runErr error, errMessage []byte, metrics []telegraf.Metric) []telegraf.Metric { + state, exitErr := getExitCode(runErr) + // This will ensure that in every error case the valid nagios state 'unknown' will be returned. + // No error needs to be thrown because the output will contain the error information. + // Description found at 'Plugin Return Codes' https://nagios-plugins.org/doc/guidelines.html + if exitErr != nil || state < 0 || state > 3 { + state = 3 } for _, m := range metrics { if m.Name() == "nagios_state" { m.AddField("state", state) - return metrics, nil + + if state == 3 { + errorMessage := string(errMessage) + if exitErr != nil && exitErr.Error() != "" { + errorMessage = exitErr.Error() + } + value, ok := m.GetField("service_output") + if !ok || value == "" { + // By adding the error message as output, the metric contains all needed information to understand + // the problem and fix it + m.AddField("service_output", errorMessage) + } + } + return metrics } } @@ -66,8 +78,7 @@ func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, er } m := metric.New("nagios_state", nil, f, ts) - metrics = append(metrics, m) - return metrics, nil + return append(metrics, m) } type NagiosParser struct { diff --git a/plugins/parsers/nagios/parser_test.go b/plugins/parsers/nagios/parser_test.go index 48cfce241d173..79a54ffa7a76f 100644 --- a/plugins/parsers/nagios/parser_test.go +++ b/plugins/parsers/nagios/parser_test.go @@ -32,7 +32,7 @@ func TestGetExitCode(t *testing.T) { errF: func() error { return errors.New("I am not *exec.ExitError") }, - expCode: 0, + expCode: 3, expErr: errors.New("I am not *exec.ExitError"), }, } @@ -89,10 +89,11 @@ func assertEqual(t *testing.T, exp, actual []telegraf.Metric) { func TestTryAddState(t *testing.T) { tests := []struct { - name string - runErrF func() error - metrics []telegraf.Metric - assertF func(*testing.T, []telegraf.Metric, error) + name string + runErrF func() error + runErrMessage []byte + metrics []telegraf.Metric + assertF func(*testing.T, []telegraf.Metric) }{ { name: "should append state=0 field to existing metric", @@ -107,7 +108,7 @@ func TestTryAddState(t *testing.T) { n("nagios_state"). f("service_output", "OK: system working").b(), }, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { exp := []telegraf.Metric{ mb(). n("nagios"). @@ -118,7 +119,6 @@ func TestTryAddState(t *testing.T) { f("state", 0).b(), } assertEqual(t, exp, metrics) - require.NoError(t, err) }, }, { @@ -131,7 +131,7 @@ func TestTryAddState(t *testing.T) { n("nagios"). f("perfdata", 0).b(), }, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { exp := []telegraf.Metric{ mb(). n("nagios"). @@ -141,7 +141,6 @@ func TestTryAddState(t *testing.T) { f("state", 0).b(), } assertEqual(t, exp, metrics) - require.NoError(t, err) }, }, { @@ -150,7 +149,7 @@ func TestTryAddState(t *testing.T) { return nil }, metrics: []telegraf.Metric{}, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { require.Len(t, metrics, 1) m := metrics[0] require.Equal(t, "nagios_state", m.Name()) @@ -158,37 +157,54 @@ func TestTryAddState(t *testing.T) { require.True(t, ok) require.Equal(t, int64(0), s) require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second) - require.NoError(t, err) }, }, { - name: "should return original metrics and an error", + name: "should return metrics with state unknown and thrown error is service_output", runErrF: func() error { return errors.New("non parsable error") }, metrics: []telegraf.Metric{ mb(). - n("nagios"). - f("perfdata", 0).b(), + n("nagios_state").b(), }, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { exp := []telegraf.Metric{ mb(). - n("nagios"). - f("perfdata", 0).b(), + n("nagios_state"). + f("state", 3). + f("service_output", "non parsable error").b(), } - expErr := "exec: get exit code: non parsable error" assertEqual(t, exp, metrics) - require.Equal(t, expErr, err.Error()) + }, + }, + { + name: "should return metrics with state unknown and service_output error from error message parameter", + runErrF: func() error { + return errors.New("") + }, + runErrMessage: []byte("some error message"), + metrics: []telegraf.Metric{ + mb(). + n("nagios_state").b(), + }, + assertF: func(t *testing.T, metrics []telegraf.Metric) { + exp := []telegraf.Metric{ + mb(). + n("nagios_state"). + f("state", 3). + f("service_output", "some error message").b(), + } + assertEqual(t, exp, metrics) }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - metrics, err := TryAddState(tt.runErrF(), tt.metrics) - tt.assertF(t, metrics, err) + metrics := AddState(tt.runErrF(), tt.runErrMessage, tt.metrics) + tt.assertF(t, metrics) }) } } From a57b0aabe2eb7358431e7c22f09de41b872a8b11 Mon Sep 17 00:00:00 2001 From: Morten Urban Date: Tue, 17 May 2022 09:32:38 +0200 Subject: [PATCH 2/2] fix(parsers.nagios): converted bare 3 into unknown status code const --- plugins/parsers/nagios/parser.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/plugins/parsers/nagios/parser.go b/plugins/parsers/nagios/parser.go index ca50a97f5b5e5..22aa5c0dff7a1 100644 --- a/plugins/parsers/nagios/parser.go +++ b/plugins/parsers/nagios/parser.go @@ -15,6 +15,10 @@ import ( "github.com/influxdata/telegraf/metric" ) +// unknownExitCode is the nagios unknown status code +// the exit code should be used if an error occurs or something unexpected happens +const unknownExitCode = 3 + // getExitCode get the exit code from an error value which is the result // of running a command through exec package api. func getExitCode(err error) (int, error) { @@ -24,7 +28,7 @@ func getExitCode(err error) (int, error) { ee, ok := err.(*exec.ExitError) if !ok { - return 3, err + return unknownExitCode, err } ws, ok := ee.Sys().(syscall.WaitStatus) @@ -43,15 +47,15 @@ func AddState(runErr error, errMessage []byte, metrics []telegraf.Metric) []tele // This will ensure that in every error case the valid nagios state 'unknown' will be returned. // No error needs to be thrown because the output will contain the error information. // Description found at 'Plugin Return Codes' https://nagios-plugins.org/doc/guidelines.html - if exitErr != nil || state < 0 || state > 3 { - state = 3 + if exitErr != nil || state < 0 || state > unknownExitCode { + state = unknownExitCode } for _, m := range metrics { if m.Name() == "nagios_state" { m.AddField("state", state) - if state == 3 { + if state == unknownExitCode { errorMessage := string(errMessage) if exitErr != nil && exitErr.Error() != "" { errorMessage = exitErr.Error()