Merge #25343
25343: server: add health checks and distress gossip r=bdarnell a=tschottdorf

While generating NodeStatus summaries, check for metrics that indicate a
severe store-, node-, or cluster-level problem. Metrics can be either
counters or gauges; for counters we have to keep state so that we can
notify only when the counter increments (not merely whenever it is nonzero).
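
To illustrate the counter handling, here is a minimal standalone sketch
(not code from this PR; the real logic lives in
pkg/server/status/health_check.go below). Gauges alert on their current
value, counters only on an increment since the previous check:

package main

import "fmt"

// checker keeps the last seen value of each counter so that it can alert
// on increments rather than on the counter merely being nonzero.
type checker struct {
	prev map[string]float64
}

func (c *checker) shouldAlert(name string, value float64, isGauge bool) bool {
	if isGauge {
		return value > 0
	}
	last, seen := c.prev[name]
	c.prev[name] = value
	// Skip the first observation: without a previous value we can't tell
	// whether the counter actually incremented.
	return seen && value > last
}

func main() {
	c := &checker{prev: map[string]float64{}}
	fmt.Println(c.shouldAlert("liveness.heartbeatfailures", 3, false)) // false: first observation
	fmt.Println(c.shouldAlert("liveness.heartbeatfailures", 3, false)) // false: no increment
	fmt.Println(c.shouldAlert("liveness.heartbeatfailures", 5, false)) // true: incremented
	fmt.Println(c.shouldAlert("ranges.unavailable", 2, true))          // true: nonzero gauge
}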

Flagged metrics are gossiped under a newly introduced key. These infos
are in turn picked up by the newly introduced internal `gossip_alerts`
table.

In effect, operators can monitor the `crdb_internal.gossip_alerts`
table on any node (though they'll want to do so on all nodes if there
are network connectivity issues). Similarly, it will be straightforward
to plumb these warnings into the UI, though to the best of my knowledge
the UI can't just query `crdb_internal`, so we may expose them in
another, more suitable location (this is trivial since Gossip is
available pretty much everywhere).
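
For example, a monitoring job could poll that table from any node with a
SQL client. A rough Go sketch (the connection string is a placeholder for
a local insecure node, and the column handling is kept generic so as not
to assume the table's exact schema):

package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // CockroachDB speaks the Postgres wire protocol.
)

func main() {
	// Placeholder connection string for a local insecure node.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Print whatever alerts this node currently knows about via gossip.
	rows, err := db.Query("SELECT * FROM crdb_internal.gossip_alerts")
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	cols, err := rows.Columns()
	if err != nil {
		log.Fatal(err)
	}
	vals := make([]interface{}, len(cols))
	ptrs := make([]interface{}, len(cols))
	for i := range vals {
		ptrs[i] = &vals[i]
	}
	for rows.Next() {
		if err := rows.Scan(ptrs...); err != nil {
			log.Fatal(err)
		}
		for i, col := range cols {
			fmt.Printf("%s=%v ", col, vals[i])
		}
		fmt.Println()
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
}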

For starters, we only check the metrics for underreplicated and
unavailable ranges as well as liveness errors, but there is no
limitation on how elaborate these health checks can become. In fact,
they aren't limited to sourcing solely from `NodeStatus`, and in light
of #25316 we should consider alerting when nodes can't (bidirectionally)
communicate, so that operators can easily diagnose DNS or firewall
issues.

NB: I had originally envisioned polling the `node_metrics` table because
that would have allowed us to write the health checks in SQL. I had this
code but ultimately deleted it as it seemed too roundabout and less
extensible.

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
craig[bot] and tbg committed May 9, 2018
2 parents ebc0957 + d74e8a1 commit 096f614
Showing 21 changed files with 1,050 additions and 184 deletions.
14 changes: 12 additions & 2 deletions pkg/gossip/keys.go
@@ -53,6 +53,10 @@ const (
// string address of the node. E.g. node:1 => 127.0.0.1:24001
KeyNodeIDPrefix = "node"

// KeyNodeHealthAlertPrefix is the key prefix for gossiping health alerts. The
// value is a proto of type HealthCheckResult.
KeyNodeHealthAlertPrefix = "health-alert"

// KeyNodeLivenessPrefix is the key prefix for gossiping node liveness info.
KeyNodeLivenessPrefix = "liveness"

@@ -112,8 +116,8 @@ func IsNodeIDKey(key string) bool {
return strings.HasPrefix(key, KeyNodeIDPrefix+separator)
}

-// NodeIDFromKey attempts to extract a NodeID from the provided key.
-// The key should have been constructed by MakeNodeIDKey.
+// NodeIDFromKey attempts to extract a NodeID from the provided key. The key
+// should have been constructed by MakeNodeIDKey or MakeNodeHealthAlertKey.
// Returns an error if the key is not of the correct type or is not parsable.
func NodeIDFromKey(key string) (roachpb.NodeID, error) {
trimmedKey, err := removePrefixFromKey(key, KeyNodeIDPrefix)
@@ -127,6 +131,12 @@ func NodeIDFromKey(key string) (roachpb.NodeID, error) {
return roachpb.NodeID(nodeID), nil
}

// MakeNodeHealthAlertKey returns the gossip key under which the given node can
// gossip health alerts.
func MakeNodeHealthAlertKey(nodeID roachpb.NodeID) string {
return MakeKey(KeyNodeHealthAlertPrefix, strconv.Itoa(int(nodeID)))
}

// MakeNodeLivenessKey returns the gossip key for node liveness info.
func MakeNodeLivenessKey(nodeID roachpb.NodeID) string {
return MakeKey(KeyNodeLivenessPrefix, nodeID.String())
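
For reference, a quick sketch of the key shape the new helper produces,
assuming the package's usual ":" separator (as in the node:1 example
above), so that node 1 gossips its alerts under "health-alert:1":

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

func main() {
	// Expected to print "health-alert:1" given the prefix above.
	fmt.Println(gossip.MakeNodeHealthAlertKey(roachpb.NodeID(1)))
}
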
38 changes: 30 additions & 8 deletions pkg/server/node.go
@@ -634,7 +634,7 @@ func (n *Node) bootstrapStores(
}
// write a new status summary after all stores have been bootstrapped; this
// helps the UI remain responsive when new nodes are added.
-if err := n.writeSummaries(ctx); err != nil {
+if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
log.Warningf(ctx, "error writing node summary after store bootstrap: %s", err)
}
}
@@ -758,23 +758,27 @@ func (n *Node) computePeriodicMetrics(ctx context.Context, tick int) error {
})
}

-// startWriteSummaries begins periodically persisting status summaries for the
+// startWriteNodeStatus begins periodically persisting status summaries for the
// node and its stores.
-func (n *Node) startWriteSummaries(frequency time.Duration) {
+func (n *Node) startWriteNodeStatus(frequency time.Duration) {
ctx := log.WithLogTag(n.AnnotateCtx(context.Background()), "summaries", nil)
// Immediately record summaries once on server startup.
n.stopper.RunWorker(ctx, func(ctx context.Context) {
// Write a status summary immediately; this helps the UI remain
// responsive when new nodes are added.
-if err := n.writeSummaries(ctx); err != nil {
+if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
log.Warningf(ctx, "error recording initial status summaries: %s", err)
}
ticker := time.NewTicker(frequency)
defer ticker.Stop()
for {
select {
case <-ticker.C:
-if err := n.writeSummaries(ctx); err != nil {
+// Use an alertTTL of twice the ticker frequency. This makes sure that
+// alerts don't disappear and reappear spuriously while at the same
+// time ensuring that an alert doesn't linger for too long after having
+// resolved.
+if err := n.writeNodeStatus(ctx, 2*frequency); err != nil {
log.Warningf(ctx, "error recording status summaries: %s", err)
}
case <-n.stopper.ShouldStop():
@@ -784,12 +788,30 @@ func (n *Node) startWriteSummaries(frequency time.Duration) {
})
}

-// writeSummaries retrieves status summaries from the supplied
+// writeNodeStatus retrieves status summaries from the supplied
// NodeStatusRecorder and persists them to the cockroach data store.
-func (n *Node) writeSummaries(ctx context.Context) error {
+func (n *Node) writeNodeStatus(ctx context.Context, alertTTL time.Duration) error {
var err error
if runErr := n.stopper.RunTask(ctx, "node.Node: writing summary", func(ctx context.Context) {
-err = n.recorder.WriteStatusSummary(ctx, n.storeCfg.DB)
+nodeStatus := n.recorder.GenerateNodeStatus(ctx)
+if nodeStatus == nil {
+return
+}
+
+if result := n.recorder.CheckHealth(ctx, *nodeStatus); len(result.Alerts) != 0 {
+log.Warningf(ctx, "health alerts detected: %+v", result)
+if err := n.storeCfg.Gossip.AddInfoProto(
+gossip.MakeNodeHealthAlertKey(n.Descriptor.NodeID), &result, alertTTL,
+); err != nil {
+log.Warningf(ctx, "unable to gossip health alerts: %+v", result)
+}
+
+// TODO(tschottdorf): add a metric that we increment every time there are
+// alerts. This can help understand how long the cluster has been in that
+// state (since it'll be incremented every ~10s).
+}
+
+err = n.recorder.WriteNodeStatus(ctx, n.storeCfg.DB, *nodeStatus)
}); runErr != nil {
err = runErr
}
4 changes: 2 additions & 2 deletions pkg/server/node_test.go
@@ -524,9 +524,9 @@ func compareNodeStatus(
return nodeStatus
}

-// TestStatusSummaries verifies that status summaries are written correctly for
+// TestNodeStatusWritten verifies that status summaries are written correctly for
// both the Node and stores within the node.
-func TestStatusSummaries(t *testing.T) {
+func TestNodeStatusWritten(t *testing.T) {
defer leaktest.AfterTest(t)()

// ========================================
2 changes: 1 addition & 1 deletion pkg/server/server.go
@@ -1440,7 +1440,7 @@ If problems persist, please see ` + base.DocsURL("cluster-setup-troubleshooting.
)

// Begin recording status summaries.
-s.node.startWriteSummaries(DefaultMetricsSampleInterval)
+s.node.startWriteNodeStatus(DefaultMetricsSampleInterval)

// Create and start the schema change manager only after a NodeID
// has been assigned.
41 changes: 41 additions & 0 deletions pkg/server/server_test.go
@@ -36,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/status"
"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
@@ -70,6 +71,46 @@ func TestSelfBootstrap(t *testing.T) {
}
}

// TestHealthCheck runs a basic sanity check on the health checker.
func TestHealthCheck(t *testing.T) {
defer leaktest.AfterTest(t)()
s, err := serverutils.StartServerRaw(base.TestServerArgs{})
if err != nil {
t.Fatal(err)
}
defer s.Stopper().Stop(context.TODO())

ctx := context.Background()

recorder := s.(*TestServer).Server.recorder

{
summary := *recorder.GenerateNodeStatus(ctx)
result := recorder.CheckHealth(ctx, summary)
if len(result.Alerts) != 0 {
t.Fatal(result)
}
}

store, err := s.(*TestServer).Server.node.stores.GetStore(1)
if err != nil {
t.Fatal(err)
}

store.Metrics().UnavailableRangeCount.Inc(100)

{
summary := *recorder.GenerateNodeStatus(ctx)
result := recorder.CheckHealth(ctx, summary)
expAlerts := []status.HealthAlert{
{StoreID: 1, Category: status.HealthAlert_METRICS, Description: "ranges.unavailable", Value: 100.0},
}
if !reflect.DeepEqual(expAlerts, result.Alerts) {
t.Fatalf("expected %+v, got %+v", expAlerts, result.Alerts)
}
}
}

// TestServerStartClock tests that a server's clock is not pushed out of thin
// air. This used to happen - the simple act of starting was causing a server's
// clock to be pushed because we were introducing bogus future timestamps into
166 changes: 166 additions & 0 deletions pkg/server/status/health_check.go
@@ -0,0 +1,166 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package status

import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/roachpb"
)

type threshold struct {
gauge bool
min int64
}

var (
counterZero = threshold{}
gaugeZero = threshold{gauge: true}
)

// TODO(tschottdorf): I think we should just export the metric metadata from
// their respective packages and reference them here, instead of the
// duplication. It also seems useful to specify the metric type in the metadata
// so that we don't have to "guess" whether it's a gauge or counter. However
// there's some massaging for latency histograms that happens in NodeStatus,
// so the logic likely has to be moved up a bit. A thread not worth pulling on
// at the moment, I suppose.
//
// TODO(tschottdorf): there are some metrics that could be used in alerts but
// need special treatment. For example, we want to alert when compactions are
// queued but not processed over long periods of time, or when queues have a
// large backlog but show no sign of processing times.
var trackedMetrics = map[string]threshold{
// Gauges.
"ranges.unavailable": gaugeZero,
"ranges.underreplicated": gaugeZero,
"requests.backpressure.split": gaugeZero,
"requests.slow.commandqueue": gaugeZero,
"requests.slow.distsender": gaugeZero,
"requests.slow.lease": gaugeZero,
"requests.slow.raft": gaugeZero,
"sys.goroutines": {gauge: true, min: 5000},

// Latencies (which are really histograms, but we get to see a fixed number
// of percentiles as gauges)
"raft.process.logcommit.latency-90": {gauge: true, min: int64(500 * time.Millisecond)},
"round-trip-latency-p90": {gauge: true, min: int64(time.Second)},

// Counters.

"liveness.heartbeatfailures": counterZero,
"timeseries.write.errors": counterZero,

// Queue processing errors. This might be too aggressive. For example, if the
// replicate queue is waiting for a split, does that generate an error? If so,
// is that worth alerting about? We might need severities here at some point
// or some other way to guard against "blips".
"compactor.compactions.failure": counterZero,
"queue.replicagc.process.failure": counterZero,
"queue.raftlog.process.failure": counterZero,
"queue.gc.process.failure": counterZero,
"queue.split.process.failure": counterZero,
"queue.replicate.process.failure": counterZero,
"queue.raftsnapshot.process.failure": counterZero,
"queue.tsmaintenance.process.failure": counterZero,
"queue.consistency.process.failure": counterZero,
}

type metricsMap map[roachpb.StoreID]map[string]float64

// update takes a populated metrics map and extracts the tracked metrics. Gauges
// are reported verbatim, while for counters the difference from the previously
// seen value is reported. Only values exceeding their threshold are reported,
// and the seen (non-relative) counter values are persisted for the next call.
func (d metricsMap) update(tracked map[string]threshold, m metricsMap) metricsMap {
out := metricsMap{}
for storeID := range m {
for name, threshold := range tracked {
val, ok := m[storeID][name]
if !ok {
continue
}

if !threshold.gauge {
prevVal, havePrev := d[storeID][name]
if d[storeID] == nil {
d[storeID] = map[string]float64{}
}
d[storeID][name] = val
if havePrev {
val -= prevVal
} else {
// Can't report the first time around if we don't know the previous
// value of the counter.
val = 0
}
}

if val > float64(threshold.min) {
if out[storeID] == nil {
out[storeID] = map[string]float64{}
}
out[storeID][name] = val
}
}
}
return out
}

// A HealthChecker inspects a node's metrics (via its NodeStatus) for
// anomalous conditions that the operator should be alerted to.
type HealthChecker struct {
state metricsMap
tracked map[string]threshold
}

// NewHealthChecker creates a new health checker that emits alerts whenever the
// given metrics exceed their thresholds. A threshold with `gauge` set is
// treated as a gauge and reports whenever its value exceeds the minimum;
// otherwise the metric is treated as a counter and reports whenever it is
// incremented (by more than the minimum) between consecutive calls of
// `CheckHealth`.
func NewHealthChecker(trackedMetrics map[string]threshold) *HealthChecker {
return &HealthChecker{state: metricsMap{}, tracked: trackedMetrics}
}

// CheckHealth performs a (cheap) health check. It is not thread safe.
func (h *HealthChecker) CheckHealth(ctx context.Context, nodeStatus NodeStatus) HealthCheckResult {
// Collect alerts for node-level metrics (reported under StoreID zero) and
// for each store's metrics.
var alerts []HealthAlert

m := map[roachpb.StoreID]map[string]float64{
0: nodeStatus.Metrics,
}
for _, storeStatus := range nodeStatus.StoreStatuses {
m[storeStatus.Desc.StoreID] = storeStatus.Metrics
}

diffs := h.state.update(h.tracked, m)

for storeID, storeDiff := range diffs {
for name, value := range storeDiff {
alerts = append(alerts, HealthAlert{
StoreID: storeID,
Category: HealthAlert_METRICS,
Description: name,
Value: value,
})
}
}

return HealthCheckResult{Alerts: alerts}
}
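
To close, a rough usage sketch of the checker in isolation, written as a
test inside package status (the threshold type and the trackedMetrics map
are unexported); the test name is made up and the NodeStatus/StoreStatus
literals only fill the fields CheckHealth reads:

package status

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

func TestHealthCheckerSketch(t *testing.T) {
	ctx := context.Background()
	h := NewHealthChecker(trackedMetrics)

	ns := NodeStatus{
		// Node-level metrics are reported under StoreID 0 by CheckHealth.
		Metrics: map[string]float64{"liveness.heartbeatfailures": 3},
		StoreStatuses: []StoreStatus{
			{
				Desc:    roachpb.StoreDescriptor{StoreID: 1},
				Metrics: map[string]float64{"ranges.unavailable": 2},
			},
		},
	}

	// First call: the unavailable-ranges gauge alerts immediately; the
	// heartbeat-failure counter does not, since there is no previous value
	// to diff against yet.
	if res := h.CheckHealth(ctx, ns); len(res.Alerts) != 1 {
		t.Fatalf("expected one gauge alert, got %+v", res.Alerts)
	}

	// Second call with an incremented counter: the gauge alerts again and
	// the counter alerts on its delta of 2.
	ns.Metrics["liveness.heartbeatfailures"] = 5
	if res := h.CheckHealth(ctx, ns); len(res.Alerts) != 2 {
		t.Fatalf("expected two alerts, got %+v", res.Alerts)
	}
}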