diff --git a/.changelog/12807.txt b/.changelog/12807.txt new file mode 100644 index 00000000000..af29d5e7c0e --- /dev/null +++ b/.changelog/12807.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0 +``` diff --git a/command/operator_debug.go b/command/operator_debug.go index dbb643a4912..6af31d21b3d 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -23,6 +23,7 @@ import ( "github.com/hashicorp/go-cleanhttp" "github.com/hashicorp/go-multierror" + goversion "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" "github.com/hashicorp/nomad/helper" @@ -54,14 +55,17 @@ type OperatorDebugCommand struct { cancel context.CancelFunc opts *api.QueryOptions verbose bool + members *api.ServerMembers + nodes []*api.NodeListStub } const ( - userAgent = "nomad operator debug" - clusterDir = "cluster" - clientDir = "client" - serverDir = "server" - intervalDir = "interval" + userAgent = "nomad operator debug" + clusterDir = "cluster" + clientDir = "client" + serverDir = "server" + intervalDir = "interval" + minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2" ) func (c *OperatorDebugCommand) Help() string { @@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int { AuthToken: c.Meta.token, } + // Get complete list of client nodes + c.nodes, _, err = client.Nodes().List(c.queryOpts()) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err)) + return 1 + } + + // Write nodes to file + c.writeJSON(clusterDir, "nodes.json", c.nodes, err) + // Search all nodes If a node class is specified without a list of node id prefixes if c.nodeClass != "" && nodeIDs == "" { nodeIDs = "all" @@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int { } // Resolve servers - members, err := client.Agent().MembersOpts(c.queryOpts()) + c.members, err = client.Agent().MembersOpts(c.queryOpts()) if err != nil { c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err)) return 1 } // Write complete list of server members to file - c.writeJSON(clusterDir, "members.json", members, err) + c.writeJSON(clusterDir, "members.json", c.members, err) // Filter for servers matching criteria - c.serverIDs, err = filterServerMembers(members, serverIDs, c.region) + c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region) if err != nil { c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err)) return 1 @@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int { serversFound := 0 serverCaptureCount := 0 - if members != nil { - serversFound = len(members.Members) + if c.members != nil { + serversFound = len(c.members.Members) } if c.serverIDs != nil { serverCaptureCount = len(c.serverIDs) @@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { + pprofNodeIDs := []string{} + pprofServerIDs := []string{} + + // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions + for _, serverID := range c.serverIDs { + version := c.getNomadVersion(serverID, "") + err := checkVersion(version, minimumVersionPprofConstraint) + if err != nil { + c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err)) + } + pprofServerIDs = append(pprofServerIDs, serverID) + } + + for _, nodeID := range c.nodeIDs { + version := c.getNomadVersion("", nodeID) + err := checkVersion(version, minimumVersionPprofConstraint) + if err != nil { + c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err)) + } + pprofNodeIDs = append(pprofNodeIDs, nodeID) + } + // Take the first set of pprofs synchronously... c.Ui.Output(" Capture pprofInterval 0000") - c.collectPprofs(client, 0) + c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0) if c.pprofInterval == c.pprofDuration { return } @@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { return case <-timer.C: c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount)) - c.collectPprofs(client, pprofIntervalCount) + c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount) timer.Reset(c.pprofInterval) pprofIntervalCount++ } @@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { } // collectPprofs captures the /agent/pprof for each listed node -func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) { - for _, n := range c.nodeIDs { +func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) { + for _, n := range nodeIDs { c.collectPprof(clientDir, n, client, interval) } - for _, n := range c.serverIDs { + for _, n := range serverIDs { c.collectPprof(serverDir, n, client, interval) } } @@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads - - // This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable - // c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives - - // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable - // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes } // savePprofProfile retrieves a pprof profile and writes to disk @@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool { const redirectErr string = `invalid character '<' looking for beginning of value` return strings.Contains(err.Error(), redirectErr) } + +// getNomadVersion fetches the version of Nomad running on a given server/client node ID +func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string { + if serverID == "" && nodeID == "" { + return "" + } + + version := "" + if serverID != "" { + for _, server := range c.members.Members { + // Raft v2 server + if server.Name == serverID { + version = server.Tags["build"] + } + + // Raft v3 server + if server.Tags["id"] == serverID { + version = server.Tags["version"] + } + } + } + + if nodeID != "" { + for _, node := range c.nodes { + if node.ID == nodeID { + version = node.Version + } + } + } + + return version +} + +// checkVersion verifies that version satisfies the constraint +func checkVersion(version string, versionConstraint string) error { + v, err := goversion.NewVersion(version) + if err != nil { + return fmt.Errorf("error: %v", err) + } + + c, err := goversion.NewConstraint(versionConstraint) + if err != nil { + return fmt.Errorf("error: %v", err) + } + + if !c.Check(v) { + return nil + } + return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint) +} diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index c50d8f92252..04471ed7c80 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) { require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway } +// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are +// filtered by the version constraint. +func TestDebug_PprofVersionCheck(t *testing.T) { + cases := []struct { + version string + errMsg string + }{ + {"0.8.7", ""}, + {"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"}, + {"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"}, + {"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"}, + {"0.11.3", ""}, + {"0.11.3+ent", ""}, + {"0.12.0", ""}, + {"1.3.0", ""}, + {"foo.bar", "error: Malformed version: foo.bar"}, + } + + for _, tc := range cases { + t.Run(tc.version, func(t *testing.T) { + err := checkVersion(tc.version, minimumVersionPprofConstraint) + if tc.errMsg == "" { + require.NoError(t, err, "expected no error from %s", tc.version) + } else { + require.EqualError(t, err, tc.errMsg) + } + }) + } +} + func TestDebug_StringToSlice(t *testing.T) { ci.Parallel(t)