Merge #25343
25343: server: add health checks and distress gossip r=bdarnell a=tschottdorf

While generating NodeStatus summaries, check for metrics that indicate a
severe store-, node-, or cluster-level problem. Metrics can be either
counters or gauges; for counters we have to keep state so that we can
notify only when the counter increments (not merely whenever it is nonzero).
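
To illustrate the counter handling, here is a minimal standalone sketch
(not code from this PR; the real logic lives in
pkg/server/status/health_check.go below). Gauges alert on their current
value, counters only on an increment since the previous check:

package main

import "fmt"

// checker keeps the last seen value of each counter so that it can alert
// on increments rather than on the counter merely being nonzero.
type checker struct {
	prev map[string]float64
}

func (c *checker) shouldAlert(name string, value float64, isGauge bool) bool {
	if isGauge {
		return value > 0
	}
	last, seen := c.prev[name]
	c.prev[name] = value
	// Skip the first observation: without a previous value we can't tell
	// whether the counter actually incremented.
	return seen && value > last
}

func main() {
	c := &checker{prev: map[string]float64{}}
	fmt.Println(c.shouldAlert("liveness.heartbeatfailures", 3, false)) // false: first observation
	fmt.Println(c.shouldAlert("liveness.heartbeatfailures", 3, false)) // false: no increment
	fmt.Println(c.shouldAlert("liveness.heartbeatfailures", 5, false)) // true: incremented
	fmt.Println(c.shouldAlert("ranges.unavailable", 2, true))          // true: nonzero gauge
}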

Flagged metrics are gossiped under a newly introduced key. These infos
are in turn picked up by the newly introduced internal `gossip_alerts`
table.

In effect, operators can monitor the `crdb_internal.gossip_alerts`
table on any node (though they'll want to do so on all nodes if there
are network connectivity issues). Similarly, it will be straightforward
to plumb these warnings into the UI, though to the best of my knowledge
the UI can't just query `crdb_internal`, so we may expose them in
another, more suitable location (this is trivial since Gossip is
available pretty much everywhere).
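
For example, a monitoring job could poll that table from any node with a
SQL client. A rough Go sketch (the connection string is a placeholder for
a local insecure node, and the column handling is kept generic so as not
to assume the table's exact schema):

package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // CockroachDB speaks the Postgres wire protocol.
)

func main() {
	// Placeholder connection string for a local insecure node.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Print whatever alerts this node currently knows about via gossip.
	rows, err := db.Query("SELECT * FROM crdb_internal.gossip_alerts")
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	cols, err := rows.Columns()
	if err != nil {
		log.Fatal(err)
	}
	vals := make([]interface{}, len(cols))
	ptrs := make([]interface{}, len(cols))
	for i := range vals {
		ptrs[i] = &vals[i]
	}
	for rows.Next() {
		if err := rows.Scan(ptrs...); err != nil {
			log.Fatal(err)
		}
		for i, col := range cols {
			fmt.Printf("%s=%v ", col, vals[i])
		}
		fmt.Println()
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
}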

For starters, we only check the metrics for underreplicated and
unavailable ranges as well as liveness errors, but there is no
limitation on how elaborate these health checks can become. In fact,
they aren't limited to sourcing solely from `NodeStatus`, and in light
of #25316 we should consider alerting when nodes can't (bidirectionally)
communicate, so that operators can easily diagnose DNS or firewall
issues.

NB: I had originally envisioned polling the `node_metrics` table because
that would have allowed us to write the health checks in SQL. I had this
code but ultimately deleted it as it seemed too roundabout and less
extensible.

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
craig[bot] and tbg committed May 9, 2018
2 parents ebc0957 + d74e8a1 commit 096f614
Showing 21 changed files with 1,050 additions and 184 deletions.
14 changes: 12 additions & 2 deletions pkg/gossip/keys.go
@@ -53,6 +53,10 @@ const (
// string address of the node. E.g. node:1 => 127.0.0.1:24001
KeyNodeIDPrefix = "node"

// KeyNodeHealthAlertPrefix is the key prefix for gossiping health alerts. The
// value is a proto of type HealthCheckResult.
KeyNodeHealthAlertPrefix = "health-alert"

// KeyNodeLivenessPrefix is the key prefix for gossiping node liveness info.
KeyNodeLivenessPrefix = "liveness"

@@ -112,8 +116,8 @@ func IsNodeIDKey(key string) bool {
return strings.HasPrefix(key, KeyNodeIDPrefix+separator)
}

-// NodeIDFromKey attempts to extract a NodeID from the provided key.
-// The key should have been constructed by MakeNodeIDKey.
+// NodeIDFromKey attempts to extract a NodeID from the provided key. The key
+// should have been constructed by MakeNodeIDKey or MakeNodeHealthAlertKey.
// Returns an error if the key is not of the correct type or is not parsable.
func NodeIDFromKey(key string) (roachpb.NodeID, error) {
trimmedKey, err := removePrefixFromKey(key, KeyNodeIDPrefix)
@@ -127,6 +131,12 @@ func NodeIDFromKey(key string) (roachpb.NodeID, error) {
return roachpb.NodeID(nodeID), nil
}

// MakeNodeHealthAlertKey returns the gossip key under which the given node can
// gossip health alerts.
func MakeNodeHealthAlertKey(nodeID roachpb.NodeID) string {
return MakeKey(KeyNodeHealthAlertPrefix, strconv.Itoa(int(nodeID)))
}

// MakeNodeLivenessKey returns the gossip key for node liveness info.
func MakeNodeLivenessKey(nodeID roachpb.NodeID) string {
return MakeKey(KeyNodeLivenessPrefix, nodeID.String())
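
For reference, a quick sketch of the key shape the new helper produces,
assuming the package's usual ":" separator (as in the node:1 example
above), so that node 1 gossips its alerts under "health-alert:1":

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

func main() {
	// Expected to print "health-alert:1" given the prefix above.
	fmt.Println(gossip.MakeNodeHealthAlertKey(roachpb.NodeID(1)))
}
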
38 changes: 30 additions & 8 deletions pkg/server/node.go
@@ -634,7 +634,7 @@ func (n *Node) bootstrapStores(
}
// write a new status summary after all stores have been bootstrapped; this
// helps the UI remain responsive when new nodes are added.
-if err := n.writeSummaries(ctx); err != nil {
+if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
log.Warningf(ctx, "error writing node summary after store bootstrap: %s", err)
}
}
@@ -758,23 +758,27 @@ func (n *Node) computePeriodicMetrics(ctx context.Context, tick int) error {
})
}

-// startWriteSummaries begins periodically persisting status summaries for the
+// startWriteNodeStatus begins periodically persisting status summaries for the
// node and its stores.
-func (n *Node) startWriteSummaries(frequency time.Duration) {
+func (n *Node) startWriteNodeStatus(frequency time.Duration) {
ctx := log.WithLogTag(n.AnnotateCtx(context.Background()), "summaries", nil)
// Immediately record summaries once on server startup.
n.stopper.RunWorker(ctx, func(ctx context.Context) {
// Write a status summary immediately; this helps the UI remain
// responsive when new nodes are added.
-if err := n.writeSummaries(ctx); err != nil {
+if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
log.Warningf(ctx, "error recording initial status summaries: %s", err)
}
ticker := time.NewTicker(frequency)
defer ticker.Stop()
for {
select {
case <-ticker.C:
-if err := n.writeSummaries(ctx); err != nil {
+// Use an alertTTL of twice the ticker frequency. This makes sure that
+// alerts don't disappear and reappear spuriously while at the same
+// time ensuring that an alert doesn't linger for too long after having
+// resolved.
+if err := n.writeNodeStatus(ctx, 2*frequency); err != nil {
log.Warningf(ctx, "error recording status summaries: %s", err)
}
case <-n.stopper.ShouldStop():
@@ -784,12 +788,30 @@ func (n *Node) startWriteSummaries(frequency time.Duration) {
})
}

-// writeSummaries retrieves status summaries from the supplied
+// writeNodeStatus retrieves status summaries from the supplied
// NodeStatusRecorder and persists them to the cockroach data store.
-func (n *Node) writeSummaries(ctx context.Context) error {
+func (n *Node) writeNodeStatus(ctx context.Context, alertTTL time.Duration) error {
var err error
if runErr := n.stopper.RunTask(ctx, "node.Node: writing summary", func(ctx context.Context) {
-err = n.recorder.WriteStatusSummary(ctx, n.storeCfg.DB)
+nodeStatus := n.recorder.GenerateNodeStatus(ctx)
+if nodeStatus == nil {
+return
+}
+
+if result := n.recorder.CheckHealth(ctx, *nodeStatus); len(result.Alerts) != 0 {
+log.Warningf(ctx, "health alerts detected: %+v", result)
+if err := n.storeCfg.Gossip.AddInfoProto(
+gossip.MakeNodeHealthAlertKey(n.Descriptor.NodeID), &result, alertTTL,
+); err != nil {
+log.Warningf(ctx, "unable to gossip health alerts: %+v", result)
+}
+
+// TODO(tschottdorf): add a metric that we increment every time there are
+// alerts. This can help understand how long the cluster has been in that
+// state (since it'll be incremented every ~10s).
+}
+
+err = n.recorder.WriteNodeStatus(ctx, n.storeCfg.DB, *nodeStatus)
}); runErr != nil {
err = runErr
}
4 changes: 2 additions & 2 deletions pkg/server/node_test.go
@@ -524,9 +524,9 @@ func compareNodeStatus(
return nodeStatus
}

-// TestStatusSummaries verifies that status summaries are written correctly for
+// TestNodeStatusWritten verifies that status summaries are written correctly for
// both the Node and stores within the node.
-func TestStatusSummaries(t *testing.T) {
+func TestNodeStatusWritten(t *testing.T) {
defer leaktest.AfterTest(t)()

// ========================================
2 changes: 1 addition & 1 deletion pkg/server/server.go
@@ -1440,7 +1440,7 @@ If problems persist, please see ` + base.DocsURL("cluster-setup-troubleshooting.
)

// Begin recording status summaries.
-s.node.startWriteSummaries(DefaultMetricsSampleInterval)
+s.node.startWriteNodeStatus(DefaultMetricsSampleInterval)

// Create and start the schema change manager only after a NodeID
// has been assigned.
41 changes: 41 additions & 0 deletions pkg/server/server_test.go
@@ -36,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/status"
"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
@@ -70,6 +71,46 @@ func TestSelfBootstrap(t *testing.T) {
}
}

// TestHealthCheck runs a basic sanity check on the health checker.
func TestHealthCheck(t *testing.T) {
defer leaktest.AfterTest(t)()
s, err := serverutils.StartServerRaw(base.TestServerArgs{})
if err != nil {
t.Fatal(err)
}
defer s.Stopper().Stop(context.TODO())

ctx := context.Background()

recorder := s.(*TestServer).Server.recorder

{
summary := *recorder.GenerateNodeStatus(ctx)
result := recorder.CheckHealth(ctx, summary)
if len(result.Alerts) != 0 {
t.Fatal(result)
}
}

store, err := s.(*TestServer).Server.node.stores.GetStore(1)
if err != nil {
t.Fatal(err)
}

store.Metrics().UnavailableRangeCount.Inc(100)

{
summary := *recorder.GenerateNodeStatus(ctx)
result := recorder.CheckHealth(ctx, summary)
expAlerts := []status.HealthAlert{
{StoreID: 1, Category: status.HealthAlert_METRICS, Description: "ranges.unavailable", Value: 100.0},
}
if !reflect.DeepEqual(expAlerts, result.Alerts) {
t.Fatalf("expected %+v, got %+v", expAlerts, result.Alerts)
}
}
}

// TestServerStartClock tests that a server's clock is not pushed out of thin
// air. This used to happen - the simple act of starting was causing a server's
// clock to be pushed because we were introducing bogus future timestamps into
166 changes: 166 additions & 0 deletions pkg/server/status/health_check.go
@@ -0,0 +1,166 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package status

import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/roachpb"
)

type threshold struct {
gauge bool
min int64
}

var (
counterZero = threshold{}
gaugeZero = threshold{gauge: true}
)

// TODO(tschottdorf): I think we should just export the metric metadata from
// their respective packages and reference them here, instead of the
// duplication. It also seems useful to specify the metric type in the metadata
// so that we don't have to "guess" whether it's a gauge or counter. However
// there's some massaging for latency histograms that happens in NodeStatus,
// so the logic likely has to be moved up a bit. A thread not worth pulling on
// at the moment, I suppose.
//
// TODO(tschottdorf): there are some metrics that could be used in alerts but
// need special treatment. For example, we want to alert when compactions are
// queued but not processed over long periods of time, or when queues have a
// large backlog but show no sign of processing times.
var trackedMetrics = map[string]threshold{
// Gauges.
"ranges.unavailable": gaugeZero,
"ranges.underreplicated": gaugeZero,
"requests.backpressure.split": gaugeZero,
"requests.slow.commandqueue": gaugeZero,
"requests.slow.distsender": gaugeZero,
"requests.slow.lease": gaugeZero,
"requests.slow.raft": gaugeZero,
"sys.goroutines": {gauge: true, min: 5000},

// Latencies (which are really histograms, but we get to see a fixed number
// of percentiles as gauges)
"raft.process.logcommit.latency-90": {gauge: true, min: int64(500 * time.Millisecond)},
"round-trip-latency-p90": {gauge: true, min: int64(time.Second)},

// Counters.

"liveness.heartbeatfailures": counterZero,
"timeseries.write.errors": counterZero,

// Queue processing errors. This might be too aggressive. For example, if the
// replicate queue is waiting for a split, does that generate an error? If so,
// is that worth alerting about? We might need severities here at some point
// or some other way to guard against "blips".
"compactor.compactions.failure": counterZero,
"queue.replicagc.process.failure": counterZero,
"queue.raftlog.process.failure": counterZero,
"queue.gc.process.failure": counterZero,
"queue.split.process.failure": counterZero,
"queue.replicate.process.failure": counterZero,
"queue.raftsnapshot.process.failure": counterZero,
"queue.tsmaintenance.process.failure": counterZero,
"queue.consistency.process.failure": counterZero,
}

type metricsMap map[roachpb.StoreID]map[string]float64

// update takes a populated metrics map and extracts the tracked metrics. Gauges
// are reported verbatim, while for counters the difference from the previously
// seen value is reported. Only values exceeding their threshold are reported,
// and the seen (non-relative) counter values are persisted for the next call.
func (d metricsMap) update(tracked map[string]threshold, m metricsMap) metricsMap {
out := metricsMap{}
for storeID := range m {
for name, threshold := range tracked {
val, ok := m[storeID][name]
if !ok {
continue
}

if !threshold.gauge {
prevVal, havePrev := d[storeID][name]
if d[storeID] == nil {
d[storeID] = map[string]float64{}
}
d[storeID][name] = val
if havePrev {
val -= prevVal
} else {
// Can't report the first time around if we don't know the previous
// value of the counter.
val = 0
}
}

if val > float64(threshold.min) {
if out[storeID] == nil {
out[storeID] = map[string]float64{}
}
out[storeID][name] = val
}
}
}
return out
}

// A HealthChecker inspects a node's metrics (via its NodeStatus) for
// anomalous conditions that the operator should be alerted to.
type HealthChecker struct {
state metricsMap
tracked map[string]threshold
}

// NewHealthChecker creates a new health checker that emits alerts whenever the
// given metrics exceed their thresholds. A threshold with `gauge` set is
// treated as a gauge and reports whenever its value exceeds the minimum;
// otherwise the metric is treated as a counter and reports whenever it is
// incremented (by more than the minimum) between consecutive calls of
// `CheckHealth`.
func NewHealthChecker(trackedMetrics map[string]threshold) *HealthChecker {
return &HealthChecker{state: metricsMap{}, tracked: trackedMetrics}
}

// CheckHealth performs a (cheap) health check. It is not thread safe.
func (h *HealthChecker) CheckHealth(ctx context.Context, nodeStatus NodeStatus) HealthCheckResult {
// Collect alerts for node-level metrics (reported under StoreID zero) and
// for each store's metrics.
var alerts []HealthAlert

m := map[roachpb.StoreID]map[string]float64{
0: nodeStatus.Metrics,
}
for _, storeStatus := range nodeStatus.StoreStatuses {
m[storeStatus.Desc.StoreID] = storeStatus.Metrics
}

diffs := h.state.update(h.tracked, m)

for storeID, storeDiff := range diffs {
for name, value := range storeDiff {
alerts = append(alerts, HealthAlert{
StoreID: storeID,
Category: HealthAlert_METRICS,
Description: name,
Value: value,
})
}
}

return HealthCheckResult{Alerts: alerts}
}
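
To close, a rough usage sketch of the checker in isolation, written as a
test inside package status (the threshold type and the trackedMetrics map
are unexported); the test name is made up and the NodeStatus/StoreStatus
literals only fill the fields CheckHealth reads:

package status

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

func TestHealthCheckerSketch(t *testing.T) {
	ctx := context.Background()
	h := NewHealthChecker(trackedMetrics)

	ns := NodeStatus{
		// Node-level metrics are reported under StoreID 0 by CheckHealth.
		Metrics: map[string]float64{"liveness.heartbeatfailures": 3},
		StoreStatuses: []StoreStatus{
			{
				Desc:    roachpb.StoreDescriptor{StoreID: 1},
				Metrics: map[string]float64{"ranges.unavailable": 2},
			},
		},
	}

	// First call: the unavailable-ranges gauge alerts immediately; the
	// heartbeat-failure counter does not, since there is no previous value
	// to diff against yet.
	if res := h.CheckHealth(ctx, ns); len(res.Alerts) != 1 {
		t.Fatalf("expected one gauge alert, got %+v", res.Alerts)
	}

	// Second call with an incremented counter: the gauge alerts again and
	// the counter alerts on its delta of 2.
	ns.Metrics["liveness.heartbeatfailures"] = 5
	if res := h.CheckHealth(ctx, ns); len(res.Alerts) != 2 {
		t.Fatalf("expected two alerts, got %+v", res.Alerts)
	}
}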