Skip to content

Commit

Permalink
Merge pull request cockroachdb#12296 from a-robinson/liveness
Browse files Browse the repository at this point in the history
storage: Add a "node is alive" metric
  • Loading branch information
a-robinson authored Dec 16, 2016
2 parents 35267cc + 754161e commit 43ca188
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 10 deletions.
65 changes: 57 additions & 8 deletions pkg/storage/node_liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,22 @@ var (

// Node liveness metrics counter names.
var (
metaHeartbeatSuccesses = metric.Metadata{Name: "liveness.heartbeatsuccesses"}
metaHeartbeatFailures = metric.Metadata{Name: "liveness.heartbeatfailures"}
metaEpochIncrements = metric.Metadata{Name: "liveness.epochincrements"}
metaLiveNodes = metric.Metadata{
Name: "liveness.livenodes",
Help: "Number of live nodes in the cluster (will be 0 if this node is not itself live)",
}
metaHeartbeatSuccesses = metric.Metadata{
Name: "liveness.heartbeatsuccesses",
Help: "Number of successful node liveness heartbeats from this node",
}
metaHeartbeatFailures = metric.Metadata{
Name: "liveness.heartbeatfailures",
Help: "Number of failed node liveness heartbeats from this node",
}
metaEpochIncrements = metric.Metadata{
Name: "liveness.epochincrements",
Help: "Number of times this node has incremented its liveness epoch",
}
)

func (l *Liveness) isLive(now hlc.Timestamp, maxOffset time.Duration) bool {
Expand All @@ -60,6 +73,7 @@ func (l *Liveness) isLive(now hlc.Timestamp, maxOffset time.Duration) bool {

// LivenessMetrics holds metrics for use with node liveness activity.
type LivenessMetrics struct {
LiveNodes *metric.Gauge
HeartbeatSuccesses *metric.Counter
HeartbeatFailures *metric.Counter
EpochIncrements *metric.Counter
Expand Down Expand Up @@ -106,11 +120,12 @@ func NewNodeLiveness(
gossip: g,
livenessThreshold: livenessThreshold,
heartbeatInterval: livenessThreshold - renewalDuration,
metrics: LivenessMetrics{
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: metric.NewCounter(metaHeartbeatFailures),
EpochIncrements: metric.NewCounter(metaEpochIncrements),
},
}
nl.metrics = LivenessMetrics{
LiveNodes: metric.NewFunctionalGauge(metaLiveNodes, nl.numLiveNodes),
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: metric.NewCounter(metaHeartbeatFailures),
EpochIncrements: metric.NewCounter(metaEpochIncrements),
}
nl.pauseHeartbeat.Store(false)
nl.mu.nodes = map[roachpb.NodeID]Liveness{}
Expand Down Expand Up @@ -396,3 +411,37 @@ func (nl *NodeLiveness) livenessGossipUpdate(key string, content roachpb.Value)
nl.mu.nodes[liveness.NodeID] = liveness
}
}

// numLiveNodes backs the metric that reports how many nodes in the cluster
// are currently live, as seen from this node.
//
// It returns 0 when this node's own ID is unset or when this node is not
// itself live: a non-live node's view of liveness is more likely to be
// inaccurate than a live node's, so it is not reported at all.
//
// Every live node exports this value instead of electing a single reporter.
// Liveness records are gossiped and may be stale, so picking one node could
// end with zero or several nodes believing they are the reporter; having all
// live nodes report is the simplest option.
func (nl *NodeLiveness) numLiveNodes() int64 {
	// A zero node ID is treated as "no identity yet"; report nothing.
	if nl.gossip.NodeID.Get() == 0 {
		return 0
	}

	nl.mu.Lock()
	defer nl.mu.Unlock()

	// Sample the clock once so every record is judged against the same instant.
	ts, offset := nl.clock.Now(), nl.clock.MaxOffset()

	// Refuse to report from a node that does not consider itself live.
	if !nl.mu.self.isLive(ts, offset) {
		return 0
	}

	count := int64(0)
	for _, liveness := range nl.mu.nodes {
		if !liveness.isLive(ts, offset) {
			continue
		}
		count++
	}
	return count
}
13 changes: 12 additions & 1 deletion pkg/storage/node_liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (

func verifyLiveness(t *testing.T, mtc *multiTestContext) {
testutils.SucceedsSoon(t, func() error {
for _, nl := range mtc.nodeLivenesses {
for i, nl := range mtc.nodeLivenesses {
for _, g := range mtc.gossips {
live, err := nl.IsLive(g.NodeID.Get())
if err != nil {
Expand All @@ -44,6 +44,10 @@ func verifyLiveness(t *testing.T, mtc *multiTestContext) {
return errors.Errorf("node %d not live", g.NodeID.Get())
}
}
if a, e := nl.Metrics().LiveNodes.Value(), int64(len(mtc.nodeLivenesses)); a != e {
return errors.Errorf("expected node %d's LiveNodes metric to be %d; got %d",
mtc.gossips[i].NodeID.Get(), e, a)
}
}
return nil
})
Expand Down Expand Up @@ -75,6 +79,13 @@ func TestNodeLiveness(t *testing.T) {
} else if live {
t.Errorf("expected node %d to be considered not-live after advancing node clock", nodeID)
}
testutils.SucceedsSoon(t, func() error {
if a, e := nl.Metrics().LiveNodes.Value(), int64(0); a != e {
return errors.Errorf("expected node %d's LiveNodes metric to be %d; got %d",
nodeID, e, a)
}
return nil
})
}
// Trigger a manual heartbeat and verify liveness is reestablished.
for _, nl := range mtc.nodeLivenesses {
Expand Down
14 changes: 13 additions & 1 deletion pkg/util/metric/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,20 @@ func (c *Counter) ToPrometheusMetric() *prometheusgo.Metric {
type Gauge struct {
Metadata
// value points at the stored reading that Value loads atomically. NewGauge
// allocates it; NewFunctionalGauge leaves it nil.
value *int64
// fn, when non-nil, is called by Value to compute the reading on demand
// instead of loading the stored value. NewGauge leaves it nil.
fn func() int64
}

// NewGauge creates a Gauge backed by a freshly allocated int64 that starts at
// zero. The gauge's fn field is left nil, so Value reports the stored integer
// rather than calling a function.
func NewGauge(metadata Metadata) *Gauge {
	// Keyed fields keep this literal valid regardless of the struct's field
	// count or order.
	return &Gauge{Metadata: metadata, value: new(int64)}
}

// NewFunctionalGauge creates a Gauge that has no stored value of its own:
// every call to Value invokes f and returns its result.
//
// Update, Inc, and Dec must NOT be called on a Gauge returned from
// NewFunctionalGauge, because its stored-value pointer is nil.
func NewFunctionalGauge(metadata Metadata, f func() int64) *Gauge {
	g := &Gauge{
		Metadata: metadata,
		fn:       f,
	}
	return g
}

// Snapshot returns a read-only copy of the gauge.
Expand All @@ -342,6 +351,9 @@ func (g *Gauge) Update(v int64) {

// Value returns the gauge's current reading. A gauge without a value function
// reports its stored integer via an atomic load; a gauge with one calls the
// function on every invocation and returns whatever it yields.
func (g *Gauge) Value() int64 {
	if g.fn == nil {
		return atomic.LoadInt64(g.value)
	}
	return g.fn()
}

Expand Down
12 changes: 12 additions & 0 deletions pkg/util/metric/metric_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ func TestGauge(t *testing.T) {
testMarshal(t, g, "10")
}

// TestFunctionalGauge verifies that a functional gauge re-evaluates its
// function on every Value call rather than caching an earlier result.
func TestFunctionalGauge(t *testing.T) {
	var current int64
	g := NewFunctionalGauge(emptyMetadata, func() int64 { return current })

	// Change the captured variable between reads; each read must see the
	// latest value.
	for _, want := range []int64{10, 15} {
		current = want
		if got := g.Value(); got != want {
			t.Fatalf("unexpected value: %d", got)
		}
	}
}

func TestGaugeFloat64(t *testing.T) {
g := NewGaugeFloat64(emptyMetadata)
g.Update(10.4)
Expand Down

0 comments on commit 43ca188

Please sign in to comment.