Skip to content

Commit

Permalink
Metrics gotemplate support, debug bundle features (#9067)
Browse files Browse the repository at this point in the history
* add goroutine text profiles to nomad operator debug

* add server-id=all to nomad operator debug

* fix bug from changing metrics from string to []byte

* Add function to return MetricsSummary struct, metrics gotemplate support

* fix bug resolving 'server-id=all' when no servers are available

* add url to operator_debug tests

* removed test section which is used for future operator_debug.go changes

* separate metrics from operator, use only structs from go-metrics

* ensure parent directories are created as needed

* add suggested comments for text debug pprof

* move check down to where it is used

* add WaitForFiles helper function to wait for multiple files to exist

* compact metrics check

Co-authored-by: Drew Bailey <[email protected]>

* fix github's silly apply suggestion

Co-authored-by: Drew Bailey <[email protected]>
  • Loading branch information
davemay99 and drewbailey authored Oct 14, 2020
1 parent b2fb40e commit 71a022a
Show file tree
Hide file tree
Showing 14 changed files with 398 additions and 87 deletions.
1 change: 1 addition & 0 deletions api/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ require (
github.com/kr/pretty v0.1.0
github.com/mitchellh/go-testing-interface v1.0.0
github.com/stretchr/testify v1.5.1
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
gopkg.in/yaml.v2 v2.2.8 // indirect
)
2 changes: 2 additions & 0 deletions api/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
Expand Down
19 changes: 0 additions & 19 deletions api/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,22 +304,3 @@ func (op *Operator) LicenseGet(q *QueryOptions) (*LicenseReply, *QueryMeta, erro
}
return &reply, qm, nil
}

// Metrics issues a raw GET against /v1/metrics and returns the response body
// as a slice of bytes. The payload is passed through undecoded; callers pick
// the representation (for example json or prometheus) through q.Params.
//
// A nil q is replaced with an empty QueryOptions. On any error the returned
// slice is nil.
func (op *Operator) Metrics(q *QueryOptions) ([]byte, error) {
	if q == nil {
		q = &QueryOptions{}
	}

	metricsReader, err := op.c.rawQuery("/v1/metrics", q)
	if err != nil {
		return nil, err
	}
	// rawQuery hands back the still-open response body; close it once it has
	// been drained so the underlying connection is released.
	defer metricsReader.Close()

	metricsBytes, err := ioutil.ReadAll(metricsReader)
	if err != nil {
		return nil, err
	}

	return metricsBytes, nil
}
87 changes: 87 additions & 0 deletions api/operator_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package api

import (
"io/ioutil"
"time"
)

// MetricsSummary holds a roll-up of metrics info for a given interval.
// Operator.MetricsSummary fills a value of this type from a /v1/metrics query.
type MetricsSummary struct {
// Timestamp is carried as a string rather than a time.Time.
Timestamp string
// The remaining fields group the reported metrics by kind. Counters and
// Samples share the SampledValue representation.
Gauges []GaugeValue
Points []PointValue
Counters []SampledValue
Samples []SampledValue
}

// GaugeValue is a single named gauge reading together with its labels.
type GaugeValue struct {
Name string
// Hash is excluded from JSON encoding and decoding.
Hash string `json:"-"`
Value float32

// Labels is excluded from JSON; DisplayLabels is the field that is
// serialized under the "Labels" key (presumably label name to label value).
Labels []Label `json:"-"`
DisplayLabels map[string]string `json:"Labels"`
}

// PointValue is a named series of float32 points.
type PointValue struct {
Name string
Points []float32
}

// SampledValue is a named aggregate with its labels. MetricsSummary uses it
// for both its Counters and its Samples.
type SampledValue struct {
Name string
// Hash is excluded from JSON encoding and decoding.
Hash string `json:"-"`
// The embedded pointer promotes the AggregateSample fields (Count, Rate,
// Sum, Min, Max, ...) onto SampledValue; it is nil until set.
*AggregateSample
Mean float64
Stddev float64

// Labels is excluded from JSON; DisplayLabels is the field that is
// serialized under the "Labels" key (presumably label name to label value).
Labels []Label `json:"-"`
DisplayLabels map[string]string `json:"Labels"`
}

// AggregateSample is used to hold aggregate metrics
// about a sample. It is embedded by pointer in SampledValue.
// SumSq and LastUpdated are tagged `json:"-"` and therefore never appear in,
// nor are read from, JSON.
type AggregateSample struct {
Count int // The count of emitted pairs
Rate float64 // The values rate per time unit (usually 1 second)
Sum float64 // The sum of values
SumSq float64 `json:"-"` // The sum of squared values
Min float64 // Minimum value
Max float64 // Maximum value
LastUpdated time.Time `json:"-"` // When value was last updated
}

// Label is a name/value pair attached to a metric; GaugeValue and
// SampledValue each carry a slice of them.
type Label struct {
Name string
Value string
}

// Metrics issues a raw GET against /v1/metrics and returns the response body
// as a slice of bytes. The payload is passed through undecoded; callers pick
// the representation (for example json or prometheus) through q.Params.
//
// A nil q is replaced with an empty QueryOptions. On any error the returned
// slice is nil.
func (op *Operator) Metrics(q *QueryOptions) ([]byte, error) {
	if q == nil {
		q = &QueryOptions{}
	}

	metricsReader, err := op.c.rawQuery("/v1/metrics", q)
	if err != nil {
		return nil, err
	}
	// rawQuery hands back the still-open response body; close it once it has
	// been drained so the underlying connection is released.
	defer metricsReader.Close()

	metricsBytes, err := ioutil.ReadAll(metricsReader)
	if err != nil {
		return nil, err
	}

	return metricsBytes, nil
}

// MetricsSummary queries /v1/metrics and returns the result as a
// *MetricsSummary along with the query metadata. If the query fails, both
// the summary and the metadata are nil and the error is returned.
func (op *Operator) MetricsSummary(q *QueryOptions) (*MetricsSummary, *QueryMeta, error) {
	var summary *MetricsSummary

	meta, err := op.c.query("/v1/metrics", &summary, q)
	if err == nil {
		return summary, meta, nil
	}

	return nil, nil, err
}
49 changes: 49 additions & 0 deletions api/operator_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package api

import (
"testing"

"github.com/stretchr/testify/require"
)

// TestOperator_MetricsSummary starts a test agent and checks that
// Operator.MetricsSummary returns a populated summary plus query metadata.
func TestOperator_MetricsSummary(t *testing.T) {
	t.Parallel()
	c, s := makeClient(t, nil, nil)
	defer s.Stop()

	operator := c.Operator()
	qo := &QueryOptions{
		Params: map[string]string{
			"pretty": "1",
		},
	}

	metrics, qm, err := operator.MetricsSummary(qo)
	require.NoError(t, err)
	require.NotNil(t, metrics)
	require.NotNil(t, qm)
	// Timestamp is a string, so a NotNil check could never fail; NotEmpty
	// actually verifies that a timestamp was returned.
	require.NotEmpty(t, metrics.Timestamp)                // should always get a TimeStamp
	require.GreaterOrEqual(t, len(metrics.Points), 0)     // may not have points yet
	require.GreaterOrEqual(t, len(metrics.Gauges), 1)     // should have at least 1 gauge
	require.GreaterOrEqual(t, len(metrics.Counters), 1)   // should have at least 1 counter
	require.GreaterOrEqual(t, len(metrics.Samples), 1)    // should have at least 1 sample
}

// TestOperator_Metrics_Prometheus starts a test agent, requests metrics with
// format=prometheus through Operator.Metrics, and expects the raw output to
// contain a "# HELP" marker.
func TestOperator_Metrics_Prometheus(t *testing.T) {
	t.Parallel()

	client, server := makeClient(t, nil, nil)
	defer server.Stop()

	opts := &QueryOptions{
		Params: map[string]string{"format": "prometheus"},
	}

	raw, err := client.Operator().Metrics(opts)
	require.NoError(t, err)
	require.NotNil(t, raw)

	text := string(raw)
	require.Containsf(t, text, "# HELP", "expected Prometheus format containing \"# HELP\", got: \n%s", text)
}
46 changes: 37 additions & 9 deletions command/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ Metrics Specific Options
-format <format>
Specify output format (prometheus)
-json
Output the allocation in its JSON format.
-t
Format and display allocation using a Go template.
`

return strings.TrimSpace(helpText)
Expand All @@ -42,19 +49,23 @@ func (c *OperatorMetricsCommand) AutocompleteFlags() complete.Flags {
complete.Flags{
"-pretty": complete.PredictAnything,
"-format": complete.PredictAnything,
"-json": complete.PredictNothing,
"-t": complete.PredictAnything,
})
}

func (c *OperatorMetricsCommand) Name() string { return "metrics" }

func (c *OperatorMetricsCommand) Run(args []string) int {
var pretty bool
var format string
var pretty, json bool
var format, tmpl string

flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
flags.BoolVar(&pretty, "pretty", false, "")
flags.StringVar(&format, "format", "", "")
flags.BoolVar(&json, "json", false, "")
flags.StringVar(&tmpl, "t", "", "")

if err := flags.Parse(args); err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing flags: %s", err))
Expand Down Expand Up @@ -88,14 +99,31 @@ func (c *OperatorMetricsCommand) Run(args []string) int {
Params: params,
}

bs, err := client.Operator().Metrics(query)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error getting metrics: %v", err))
return 1
if json || len(tmpl) > 0 {
metrics, _, err := client.Operator().MetricsSummary(query)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying metrics: %v", err))
return 1
}

out, err := Format(json, tmpl, metrics)
if err != nil {
c.Ui.Error(err.Error())
return 1
}

c.Ui.Output(out)
return 0
} else {
bs, err := client.Operator().Metrics(query)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error getting metrics: %v", err))
return 1
}

resp := string(bs[:])
c.Ui.Output(resp)
}

resp := string(bs[:])
c.Ui.Output(resp)

return 0
}
14 changes: 14 additions & 0 deletions command/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@ func TestCommand_Metrics_Cases(t *testing.T) {
expectedOutput string
expectedError string
}{
{
"gotemplate MetricsSummary",
[]string{"-address=" + url, "-t", "'{{ .Timestamp }}'"},
0,
"UTC",
"",
},
{
"json formatted MetricsSummary",
[]string{"-address=" + url, "-json"},
0,
"{",
"",
},
{
"pretty print json",
[]string{"-address=" + url, "-pretty"},
Expand Down
70 changes: 60 additions & 10 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ Debug Options:
profiles. Accepts id prefixes.
-server-id=<server>,<server>
Comma separated list of Nomad server names, or "leader" to monitor for logs and include pprof
Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof
profiles.
-stale=<true|false>
Expand Down Expand Up @@ -251,9 +251,25 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
}

// Resolve server prefixes
for _, id := range argNodes(serverIDs) {
c.serverIDs = append(c.serverIDs, id)
// Resolve servers
members, err := client.Agent().Members()
c.writeJSON("version", "members.json", members, err)
// We always write the error to the file, but don't range if no members found
if serverIDs == "all" && members != nil {
// Special case to capture from all servers
for _, member := range members.Members {
c.serverIDs = append(c.serverIDs, member.Name)
}
} else {
for _, id := range argNodes(serverIDs) {
c.serverIDs = append(c.serverIDs, id)
}
}

// Return error if servers were specified but not found
if len(serverIDs) > 0 && len(c.serverIDs) == 0 {
c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
return 1
}

c.manifest = make([]string, 0)
Expand All @@ -267,6 +283,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
stamped := "nomad-debug-" + c.timestamp

c.Ui.Output("Starting debugger and capturing cluster data...")
c.Ui.Output(fmt.Sprintf("Capturing from servers: %v", c.serverIDs))
c.Ui.Output(fmt.Sprintf("Capturing from client nodes: %v", c.nodeIDs))

c.Ui.Output(fmt.Sprintf(" Interval: '%s'", interval))
c.Ui.Output(fmt.Sprintf(" Duration: '%s'", duration))
Expand Down Expand Up @@ -499,6 +517,23 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
if err == nil {
c.writeBytes(path, "goroutine.prof", bs)
}

// Gather goroutine text output - debug type 1
// debug type 1 writes the legacy text format for human readable output
opts.Debug = 1
bs, err = client.Agent().Lookup("goroutine", opts, nil)
if err == nil {
c.writeBytes(path, "goroutine-debug1.txt", bs)
}

// Gather goroutine text output - debug type 2
// When printing the "goroutine" profile, debug=2 means to print the goroutine
// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
opts.Debug = 2
bs, err = client.Agent().Lookup("goroutine", opts, nil)
if err == nil {
c.writeBytes(path, "goroutine-debug2.txt", bs)
}
}

// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
Expand Down Expand Up @@ -576,8 +611,11 @@ func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) erro
vs, _, err := client.CSIVolumes().List(qo)
c.writeJSON(dir, "volumes.json", vs, err)

metrics, err := client.Operator().Metrics(qo)
c.writeJSON(dir, "metrics.json", metrics, err)
if metricBytes, err := client.Operator().Metrics(qo); err != nil {
c.writeError(dir, "metrics.json", err)
} else {
c.writeBytes(dir, "metrics.json", metricBytes)
}

return nil
}
Expand Down Expand Up @@ -628,12 +666,24 @@ func (c *OperatorDebugCommand) collectVault(dir, vault string) error {

// writeBytes writes a file to the archive, recording it in the manifest
func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error {
path := filepath.Join(dir, file)
c.manifest = append(c.manifest, path)
path = filepath.Join(c.collectDir, path)
relativePath := filepath.Join(dir, file)
c.manifest = append(c.manifest, relativePath)
dirPath := filepath.Join(c.collectDir, dir)
filePath := filepath.Join(dirPath, file)

// Ensure parent directories exist
err := os.MkdirAll(dirPath, os.ModePerm)
if err != nil {
// Display error immediately -- may not see this if files aren't written
c.Ui.Error(fmt.Sprintf("failed to create parent directories of \"%s\": %s", dirPath, err.Error()))
return err
}

fh, err := os.Create(path)
// Create the file
fh, err := os.Create(filePath)
if err != nil {
// Display error immediately -- may not see this if files aren't written
c.Ui.Error(fmt.Sprintf("failed to create file \"%s\": %s", filePath, err.Error()))
return err
}
defer fh.Close()
Expand Down
Loading

0 comments on commit 71a022a

Please sign in to comment.