Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(parsers/nagios): metrics will always return a supported status co… #11062

Merged
merged 2 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions plugins/inputs/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,9 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync
defer wg.Done()
_, isNagios := e.parser.(*nagios.NagiosParser)

out, errbuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout))
out, errBuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout))
if !isNagios && runErr != nil {
err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errbuf))
err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errBuf))
acc.AddError(err)
return
}
Expand All @@ -141,10 +141,7 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync
}

if isNagios {
metrics, err = nagios.TryAddState(runErr, metrics)
if err != nil {
e.Log.Errorf("Failed to add nagios state: %s", err)
}
metrics = nagios.AddState(runErr, errBuf, metrics)
}

for _, m := range metrics {
Expand Down
45 changes: 30 additions & 15 deletions plugins/parsers/nagios/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bufio"
"bytes"
"errors"
"fmt"
"os/exec"
"regexp"
"strconv"
Expand All @@ -16,6 +15,10 @@ import (
"github.com/influxdata/telegraf/metric"
)

// unknownExitCode is the nagios unknown status code
// the exit code should be used if an error occurs or something unexpected happens
const unknownExitCode = 3

// getExitCode get the exit code from an error value which is the result
// of running a command through exec package api.
func getExitCode(err error) (int, error) {
Expand All @@ -25,10 +28,7 @@ func getExitCode(err error) (int, error) {

ee, ok := err.(*exec.ExitError)
if !ok {
// If it is not an *exec.ExitError, then it must be
// an io error, but docs do not say anything about the
// exit code in this case.
return 0, err
return unknownExitCode, err
}

ws, ok := ee.Sys().(syscall.WaitStatus)
Expand All @@ -39,19 +39,35 @@ func getExitCode(err error) (int, error) {
return ws.ExitStatus(), nil
}

// TryAddState attempts to add a state derived from the runErr.
// If any error occurs, it is guaranteed to be returned along with
// the initial metric slice.
func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, error) {
state, err := getExitCode(runErr)
if err != nil {
return metrics, fmt.Errorf("exec: get exit code: %s", err)
// AddState adds a state derived from the runErr. Unknown state will be set as fallback.
// If any error occurs, it is guaranteed to be added to the service output.
// An updated slice of metrics will be returned.
func AddState(runErr error, errMessage []byte, metrics []telegraf.Metric) []telegraf.Metric {
state, exitErr := getExitCode(runErr)
// This will ensure that in every error case the valid nagios state 'unknown' will be returned.
// No error needs to be thrown because the output will contain the error information.
// Description found at 'Plugin Return Codes' https://nagios-plugins.org/doc/guidelines.html
if exitErr != nil || state < 0 || state > unknownExitCode {
state = unknownExitCode
}

for _, m := range metrics {
if m.Name() == "nagios_state" {
m.AddField("state", state)
return metrics, nil

if state == unknownExitCode {
errorMessage := string(errMessage)
if exitErr != nil && exitErr.Error() != "" {
errorMessage = exitErr.Error()
}
value, ok := m.GetField("service_output")
if !ok || value == "" {
// By adding the error message as output, the metric contains all needed information to understand
// the problem and fix it
m.AddField("service_output", errorMessage)
}
}
return metrics
}
}

Expand All @@ -66,8 +82,7 @@ func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, er
}
m := metric.New("nagios_state", nil, f, ts)

metrics = append(metrics, m)
return metrics, nil
return append(metrics, m)
}

type NagiosParser struct {
Expand Down
58 changes: 37 additions & 21 deletions plugins/parsers/nagios/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func TestGetExitCode(t *testing.T) {
errF: func() error {
return errors.New("I am not *exec.ExitError")
},
expCode: 0,
expCode: 3,
expErr: errors.New("I am not *exec.ExitError"),
},
}
Expand Down Expand Up @@ -89,10 +89,11 @@ func assertEqual(t *testing.T, exp, actual []telegraf.Metric) {

func TestTryAddState(t *testing.T) {
tests := []struct {
name string
runErrF func() error
metrics []telegraf.Metric
assertF func(*testing.T, []telegraf.Metric, error)
name string
runErrF func() error
runErrMessage []byte
metrics []telegraf.Metric
assertF func(*testing.T, []telegraf.Metric)
}{
{
name: "should append state=0 field to existing metric",
Expand All @@ -107,7 +108,7 @@ func TestTryAddState(t *testing.T) {
n("nagios_state").
f("service_output", "OK: system working").b(),
},
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{
mb().
n("nagios").
Expand All @@ -118,7 +119,6 @@ func TestTryAddState(t *testing.T) {
f("state", 0).b(),
}
assertEqual(t, exp, metrics)
require.NoError(t, err)
},
},
{
Expand All @@ -131,7 +131,7 @@ func TestTryAddState(t *testing.T) {
n("nagios").
f("perfdata", 0).b(),
},
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{
mb().
n("nagios").
Expand All @@ -141,7 +141,6 @@ func TestTryAddState(t *testing.T) {
f("state", 0).b(),
}
assertEqual(t, exp, metrics)
require.NoError(t, err)
},
},
{
Expand All @@ -150,45 +149,62 @@ func TestTryAddState(t *testing.T) {
return nil
},
metrics: []telegraf.Metric{},
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
assertF: func(t *testing.T, metrics []telegraf.Metric) {
require.Len(t, metrics, 1)
m := metrics[0]
require.Equal(t, "nagios_state", m.Name())
s, ok := m.GetField("state")
require.True(t, ok)
require.Equal(t, int64(0), s)
require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second)
require.NoError(t, err)
},
},
{
name: "should return original metrics and an error",
name: "should return metrics with state unknown and thrown error is service_output",
runErrF: func() error {
return errors.New("non parsable error")
},
metrics: []telegraf.Metric{
mb().
n("nagios").
f("perfdata", 0).b(),
n("nagios_state").b(),
},
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{
mb().
n("nagios").
f("perfdata", 0).b(),
n("nagios_state").
f("state", 3).
f("service_output", "non parsable error").b(),
}
expErr := "exec: get exit code: non parsable error"

assertEqual(t, exp, metrics)
require.Equal(t, expErr, err.Error())
},
},
{
name: "should return metrics with state unknown and service_output error from error message parameter",
runErrF: func() error {
return errors.New("")
},
runErrMessage: []byte("some error message"),
metrics: []telegraf.Metric{
mb().
n("nagios_state").b(),
},
assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{
mb().
n("nagios_state").
f("state", 3).
f("service_output", "some error message").b(),
}
assertEqual(t, exp, metrics)
},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
metrics, err := TryAddState(tt.runErrF(), tt.metrics)
tt.assertF(t, metrics, err)
metrics := AddState(tt.runErrF(), tt.runErrMessage, tt.metrics)
tt.assertF(t, metrics)
})
}
}
Expand Down