Skip to content

Commit

Permalink
storage: Add a "node is alive" metric
Browse files Browse the repository at this point in the history
The idea is that you can aggregate this metric across your nodes to
determine how many healthy nodes you have at any given time, while still
maintaining the ability to check each individual node's health.
  • Loading branch information
a-robinson committed Dec 15, 2016
1 parent f28fab4 commit 754161e
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
65 changes: 57 additions & 8 deletions pkg/storage/node_liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,22 @@ var (

// Node liveness metric metadata: the name and, where provided, the help text
// of each gauge and counter reported by node liveness.
var (
metaHeartbeatSuccesses = metric.Metadata{Name: "liveness.heartbeatsuccesses"}
metaHeartbeatFailures = metric.Metadata{Name: "liveness.heartbeatfailures"}
metaEpochIncrements = metric.Metadata{Name: "liveness.epochincrements"}
metaLiveNodes = metric.Metadata{
Name: "liveness.livenodes",
Help: "Number of live nodes in the cluster (will be 0 if this node is not itself live)",
}
metaHeartbeatSuccesses = metric.Metadata{
Name: "liveness.heartbeatsuccesses",
Help: "Number of successful node liveness heartbeats from this node",
}
metaHeartbeatFailures = metric.Metadata{
Name: "liveness.heartbeatfailures",
Help: "Number of failed node liveness heartbeats from this node",
}
metaEpochIncrements = metric.Metadata{
Name: "liveness.epochincrements",
Help: "Number of times this node has incremented its liveness epoch",
}
)

func (l *Liveness) isLive(now hlc.Timestamp, maxOffset time.Duration) bool {
Expand All @@ -60,6 +73,7 @@ func (l *Liveness) isLive(now hlc.Timestamp, maxOffset time.Duration) bool {

// LivenessMetrics holds metrics for use with node liveness activity.
type LivenessMetrics struct {
LiveNodes *metric.Gauge
HeartbeatSuccesses *metric.Counter
HeartbeatFailures *metric.Counter
EpochIncrements *metric.Counter
Expand Down Expand Up @@ -106,11 +120,12 @@ func NewNodeLiveness(
gossip: g,
livenessThreshold: livenessThreshold,
heartbeatInterval: livenessThreshold - renewalDuration,
metrics: LivenessMetrics{
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: metric.NewCounter(metaHeartbeatFailures),
EpochIncrements: metric.NewCounter(metaEpochIncrements),
},
}
nl.metrics = LivenessMetrics{
LiveNodes: metric.NewFunctionalGauge(metaLiveNodes, nl.numLiveNodes),
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: metric.NewCounter(metaHeartbeatFailures),
EpochIncrements: metric.NewCounter(metaEpochIncrements),
}
nl.pauseHeartbeat.Store(false)
nl.mu.nodes = map[roachpb.NodeID]Liveness{}
Expand Down Expand Up @@ -396,3 +411,37 @@ func (nl *NodeLiveness) livenessGossipUpdate(key string, content roachpb.Value)
nl.mu.nodes[liveness.NodeID] = liveness
}
}

// numLiveNodes reports how many entries in this node's liveness table are
// currently live; it is the value function behind the live-nodes gauge.
// The result is 0 when this node's gossip node ID is 0 or when this node is
// not itself live, because a non-live node's view of the cluster is more
// likely to be inaccurate than a live node's.
// Every live node exports the value rather than one designated node: liveness
// records arrive via gossip and may be stale, so picking a single reporter
// could end with no node or several nodes reporting. Letting all live nodes
// report is the simplest arrangement.
func (nl *NodeLiveness) numLiveNodes() int64 {
	if nl.gossip.NodeID.Get() == 0 {
		return 0
	}

	nl.mu.Lock()
	defer nl.mu.Unlock()

	// Read the clock once so that this node and every other record are judged
	// against the same timestamp and offset.
	ts, offset := nl.clock.Now(), nl.clock.MaxOffset()

	var count int64
	// Only tally the table when this node is live; otherwise fall through
	// with the zero count.
	if nl.mu.self.isLive(ts, offset) {
		for _, rec := range nl.mu.nodes {
			if rec.isLive(ts, offset) {
				count++
			}
		}
	}
	return count
}
13 changes: 12 additions & 1 deletion pkg/storage/node_liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (

func verifyLiveness(t *testing.T, mtc *multiTestContext) {
testutils.SucceedsSoon(t, func() error {
for _, nl := range mtc.nodeLivenesses {
for i, nl := range mtc.nodeLivenesses {
for _, g := range mtc.gossips {
live, err := nl.IsLive(g.NodeID.Get())
if err != nil {
Expand All @@ -44,6 +44,10 @@ func verifyLiveness(t *testing.T, mtc *multiTestContext) {
return errors.Errorf("node %d not live", g.NodeID.Get())
}
}
if a, e := nl.Metrics().LiveNodes.Value(), int64(len(mtc.nodeLivenesses)); a != e {
return errors.Errorf("expected node %d's LiveNodes metric to be %d; got %d",
mtc.gossips[i].NodeID.Get(), e, a)
}
}
return nil
})
Expand Down Expand Up @@ -75,6 +79,13 @@ func TestNodeLiveness(t *testing.T) {
} else if live {
t.Errorf("expected node %d to be considered not-live after advancing node clock", nodeID)
}
testutils.SucceedsSoon(t, func() error {
if a, e := nl.Metrics().LiveNodes.Value(), int64(0); a != e {
return errors.Errorf("expected node %d's LiveNodes metric to be %d; got %d",
nodeID, e, a)
}
return nil
})
}
// Trigger a manual heartbeat and verify liveness is reestablished.
for _, nl := range mtc.nodeLivenesses {
Expand Down

0 comments on commit 754161e

Please sign in to comment.