Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
storage, ui: add metrics for Raft messages
Browse files Browse the repository at this point in the history
Add additional metrics for Raft messages to help in debugging: total
messages sent and received, transport queue length, dropped messages,
and ticks. Expose these metrics in the Admin UI, under the "Advanced
Internals" section. Closes cockroachdb#8645.
Arjun Narayan committed Aug 25, 2016
1 parent 75921d4 commit 6375bee
Showing 5 changed files with 285 additions and 7 deletions.
37 changes: 37 additions & 0 deletions storage/replica.go
Original file line number Diff line number Diff line change
@@ -1536,6 +1536,20 @@ func defaultProposeRaftCommandLocked(r *Replica, p *pendingCmd) error {
})
}

// isLeaseHolder reports whether the lease recorded in this replica's state
// both covers the current reading of the store's clock and is owned by this
// replica's store. It returns false when no lease is recorded.
//
// NOTE(review): this reads r.mu.state without taking a lock itself;
// presumably callers are expected to hold r.mu -- confirm.
func (r *Replica) isLeaseHolder() bool {
	held := r.mu.state.Lease
	if held == nil {
		return false
	}
	// The clock is consulted only once a lease is known to exist, and
	// ownership is checked only when the lease covers that instant.
	now := r.store.Clock().Now()
	return held.Covers(now) && held.OwnedBy(r.store.Ident.StoreID)
}

func (r *Replica) handleRaftReady() error {
ctx := context.TODO()
var hasReady bool
@@ -1545,6 +1559,18 @@ func (r *Replica) handleRaftReady() error {
lastIndex := r.mu.lastIndex // used for append below
raftLogSize := r.mu.raftLogSize
leaderID := r.mu.leaderID

if leaderID == r.mu.replicaID {
r.store.metrics.raftLeaders.Inc(1)
}

if r.isLeaseHolder() {
r.store.metrics.raftLeaseHolders.Inc(1)
if leaderID != r.mu.replicaID {
r.store.metrics.raftLeaseHoldersWithoutLeadership.Inc(1)
}
}

err := r.withRaftGroupLocked(func(raftGroup *raft.RawNode) error {
if hasReady = raftGroup.HasReady(); hasReady {
rd = raftGroup.Ready()
@@ -1872,6 +1898,16 @@ func (r *Replica) sendRaftMessage(msg raftpb.Message) {
return
}

r.store.ctx.Transport.mu.Lock()
transportQueues := r.store.ctx.Transport.mu.queues[false]
var queuedMsgs int64
for _, queue := range transportQueues {
queuedMsgs += int64(len(queue))
}
r.store.ctx.Transport.mu.Unlock()
r.store.metrics.RaftSentPending.Update(queuedMsgs)
r.store.metrics.RaftSentMessages.Inc(1)

if !r.store.ctx.Transport.SendAsync(&RaftMessageRequest{
RangeID: rangeID,
ToReplica: toReplica,
@@ -1880,6 +1916,7 @@ func (r *Replica) sendRaftMessage(msg raftpb.Message) {
}) {
r.mu.Lock()
r.mu.droppedMessages++
r.store.metrics.RaftDroppedMessages.Inc(1)
r.mu.Unlock()

if err := r.withRaftGroup(func(raftGroup *raft.RawNode) error {
14 changes: 14 additions & 0 deletions storage/store.go
Original file line number Diff line number Diff line change
@@ -2202,6 +2202,9 @@ func (s *Store) HandleRaftRequest(ctx context.Context, req *RaftMessageRequest)
}

addedPlaceholder := false

s.metrics.raftRcvdMessages[req.Message.Type].Inc(1)

switch req.Message.Type {
case raftpb.MsgSnap:
if earlyReturn := func() bool {
@@ -2512,6 +2515,10 @@ func (s *Store) processRaft() {
// applying snapshots). We therefore process all uninitialized
// replicas serially, before starting initialized replicas in
// parallel.
s.metrics.raftLeaders.Clear()
s.metrics.raftLeaseHolders.Clear()
s.metrics.raftLeaseHoldersWithoutLeadership.Clear()

for _, r := range uninitReplicas {
start := timeutil.Now()
if err := r.handleRaftReady(); err != nil {
@@ -2540,7 +2547,11 @@ func (s *Store) processRaft() {
// replicas, clear all remaining placeholders.
s.clearAllPlaceholders()
s.processRaftMu.Unlock()

s.metrics.RaftWorkingDurationNanos.Inc(timeutil.Since(workingStart).Nanoseconds())
s.metrics.RaftLeaders.Update(s.metrics.raftLeaders.Count())
s.metrics.RaftLeaseHolders.Update(s.metrics.raftLeaseHolders.Count())
s.metrics.RaftLeaseHoldersWithoutLeadership.Update(s.metrics.raftLeaseHoldersWithoutLeadership.Count())

maybeWarnDuration(workingStart, s, "raft ready processing")

@@ -2576,6 +2587,9 @@ func (s *Store) processRaft() {
}
}
s.mu.Unlock()

s.metrics.RaftTicks.Inc(1)

// Enqueue all pending ranges for readiness checks. Note that we could
// not hold the pendingRaftGroups lock during the previous loop because
// of lock ordering constraints with r.tick().
185 changes: 182 additions & 3 deletions storage/store_metrics.go
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/storage/engine/enginepb"
"github.com/cockroachdb/cockroach/util/metric"
"github.com/cockroachdb/cockroach/util/syncutil"
"github.com/coreos/etcd/raft/raftpb"
)

var (
@@ -74,9 +75,106 @@ var (
metaRangeSnapshotsPreemptiveApplied = metric.Metadata{Name: "range.snapshots.preemptive-applied"}

// Raft processing metrics.
metaRaftSelectDurationNanos = metric.Metadata{Name: "process-raft.waitingnanos"}
metaRaftWorkingDurationNanos = metric.Metadata{Name: "process-raft.workingnanos"}
metaRaftTickingDurationNanos = metric.Metadata{Name: "process-raft.tickingnanos"}
metaRaftSelectDurationNanos = metric.Metadata{Name: "raft.process.waitingnanos",
Help: "Nanoseconds spent in store.processRaft() waiting"}
metaRaftWorkingDurationNanos = metric.Metadata{Name: "raft.process.workingnanos",
Help: "Nanoseconds spent in store.processRaft() working"}
metaRaftTickingDurationNanos = metric.Metadata{Name: "raft.process.tickingnanos",
Help: "Nanoseconds spent in store.processRaft() processing replica.Tick()"}

// Raft message metrics.
metaRaftReceivedHup = metric.Metadata{
Name: "raft.received.hup",
Help: "Total number of MsgHup messages received by this store",
}
metaRaftReceivedBeat = metric.Metadata{
Name: "raft.received.beat",
Help: "Total number of MsgBeat messages received by this store",
}
metaRaftReceivedProp = metric.Metadata{
Name: "raft.received.prop",
Help: "Total number of MsgProp messages received by this store",
}
metaRaftReceivedApp = metric.Metadata{
Name: "raft.received.app",
Help: "Total number of MsgApp messages received by this store",
}
metaRaftReceivedAppResp = metric.Metadata{
Name: "raft.received.appresp",
Help: "Total number of MsgAppResp messages received by this store",
}
metaRaftReceivedVote = metric.Metadata{
Name: "raft.received.vote",
Help: "Total number of MsgVote messages received by this store",
}
metaRaftReceivedVoteResp = metric.Metadata{
Name: "raft.received.voteresp",
Help: "Total number of MsgVoteResp messages received by this store",
}
metaRaftReceivedSnap = metric.Metadata{
Name: "raft.received.snap",
Help: "Total number of MsgSnap messages received by this store",
}
metaRaftReceivedHeartbeat = metric.Metadata{
Name: "raft.received.heartbeat",
Help: "Total number of MsgHeartbeat messages received by this store",
}
metaRaftReceivedHeartbeatResp = metric.Metadata{
Name: "raft.received.heartbeatresp",
Help: "Total number of MsgHeartbeatResp messages received by this store",
}
metaRaftReceivedUnreachable = metric.Metadata{
Name: "raft.received.unreachable",
Help: "Total number of MsgUnreachable messages received by this store",
}
metaRaftReceivedSnapStatus = metric.Metadata{
Name: "raft.received.snapstatus",
Help: "Total number of MsgSnapStatus messages received by this store",
}
metaRaftReceivedCheckQuorum = metric.Metadata{
Name: "raft.received.checkquorum",
Help: "Total number of MsgCheckQuorum messages received by this store",
}
metaRaftReceivedTransferLeader = metric.Metadata{
Name: "raft.received.transferleader",
Help: "Total number of MsgTransferLeader messages received by this store",
}
metaRaftReceivedTimeoutNow = metric.Metadata{
Name: "raft.received.timeoutnow",
Help: "Total number of MsgTimeoutNow messages received by this store",
}

metaRaftSentMessages = metric.Metadata{
Name: "raft.sent.total",
Help: "Total number of outgoing messages queued from this store (including messages that were later dropped)",
}

metaRaftDroppedMessages = metric.Metadata{
Name: "raft.sent.dropped",
Help: "Total number of outgoing messages from this store that were dropped by the transport",
}

metaRaftTicks = metric.Metadata{
Name: "raft.ticks",
Help: "Total number of Raft ticks processed"}

metaRaftLeaders = metric.Metadata{
Name: "raft.leaders",
Help: "Total number of Ranges on this store that are leaders.",
}

metaRaftLeaseHolders = metric.Metadata{
Name: "raft.leaseholders",
Help: "Total number of Ranges on this store that are leaseholders.",
}

metaRaftLeaseHoldersWithoutLeadership = metric.Metadata{
Name: "raft.leaseholders.without.leadership",
Help: "Total number of Ranges on this store that are leaseholders but not Raft leaders.",
}

metaRaftSentPending = metric.Metadata{Name: "raft.sent.pending",
Help: "Number of pending outgoing messages in the Raft Transport queue"}
)

// StoreMetrics is the set of metrics for a given store.
@@ -141,6 +239,42 @@ type StoreMetrics struct {
RaftWorkingDurationNanos *metric.Counter
RaftTickingDurationNanos *metric.Counter

// Raft message metrics.
RaftRcvdMsgHup *metric.Counter
RaftRcvdMsgBeat *metric.Counter
RaftRcvdMsgProp *metric.Counter
RaftRcvdMsgApp *metric.Counter
RaftRcvdMsgAppResp *metric.Counter
RaftRcvdMsgVote *metric.Counter
RaftRcvdMsgVoteResp *metric.Counter
RaftRcvdMsgSnap *metric.Counter
RaftRcvdMsgHeartbeat *metric.Counter
RaftRcvdMsgHeartbeatResp *metric.Counter
RaftRcvdMsgUnreachable *metric.Counter
RaftRcvdMsgSnapStatus *metric.Counter
RaftRcvdMsgCheckQuorum *metric.Counter
RaftRcvdMsgTransferLeader *metric.Counter
RaftRcvdMsgTimeoutNow *metric.Counter

// A map for conveniently finding the appropriate metric. The individual
// metric references must exist as AddMetricStruct adds them by reflection
// on this struct and does not process map types.
// TODO(arjun): eliminate this duplication.
raftRcvdMessages map[raftpb.MessageType]*metric.Counter

RaftDroppedMessages *metric.Counter
RaftSentMessages *metric.Counter
RaftTicks *metric.Counter
RaftSentPending *metric.Gauge

// These three gauges must never expose a partial tally, so the per-replica
// counts are first accumulated in the private counters and each gauge is
// then set from its counter in a single update.
RaftLeaders *metric.Gauge
raftLeaders *metric.Counter
RaftLeaseHolders *metric.Gauge
raftLeaseHolders *metric.Counter
RaftLeaseHoldersWithoutLeadership *metric.Gauge
raftLeaseHoldersWithoutLeadership *metric.Counter

// Stats for efficient merges.
// TODO(mrtracy): This should be removed as part of #4465. This is only
// maintained to keep the current structure of StatusSummaries; it would be
@@ -206,8 +340,53 @@ func newStoreMetrics() *StoreMetrics {
RaftSelectDurationNanos: metric.NewCounter(metaRaftSelectDurationNanos),
RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos),
RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos),

// Raft message metrics.
RaftRcvdMsgHup: metric.NewCounter(metaRaftReceivedHup),
RaftRcvdMsgBeat: metric.NewCounter(metaRaftReceivedBeat),
RaftRcvdMsgProp: metric.NewCounter(metaRaftReceivedProp),
RaftRcvdMsgApp: metric.NewCounter(metaRaftReceivedApp),
RaftRcvdMsgAppResp: metric.NewCounter(metaRaftReceivedAppResp),
RaftRcvdMsgVote: metric.NewCounter(metaRaftReceivedVote),
RaftRcvdMsgVoteResp: metric.NewCounter(metaRaftReceivedVoteResp),
RaftRcvdMsgSnap: metric.NewCounter(metaRaftReceivedSnap),
RaftRcvdMsgHeartbeat: metric.NewCounter(metaRaftReceivedHeartbeat),
RaftRcvdMsgHeartbeatResp: metric.NewCounter(metaRaftReceivedHeartbeatResp),
RaftRcvdMsgUnreachable: metric.NewCounter(metaRaftReceivedUnreachable),
RaftRcvdMsgSnapStatus: metric.NewCounter(metaRaftReceivedSnapStatus),
RaftRcvdMsgCheckQuorum: metric.NewCounter(metaRaftReceivedCheckQuorum),
RaftRcvdMsgTransferLeader: metric.NewCounter(metaRaftReceivedTransferLeader),
RaftRcvdMsgTimeoutNow: metric.NewCounter(metaRaftReceivedTimeoutNow),
raftRcvdMessages: make(map[raftpb.MessageType]*metric.Counter, len(raftpb.MessageType_name)),

RaftSentMessages: metric.NewCounter(metaRaftSentMessages),
RaftDroppedMessages: metric.NewCounter(metaRaftDroppedMessages),
RaftTicks: metric.NewCounter(metaRaftTicks),
RaftLeaders: metric.NewGauge(metaRaftLeaders),
raftLeaders: metric.NewCounter(metaRaftLeaders),
RaftLeaseHolders: metric.NewGauge(metaRaftLeaseHolders),
raftLeaseHolders: metric.NewCounter(metaRaftLeaseHolders),
RaftLeaseHoldersWithoutLeadership: metric.NewGauge(metaRaftLeaseHoldersWithoutLeadership),
raftLeaseHoldersWithoutLeadership: metric.NewCounter(metaRaftLeaseHoldersWithoutLeadership),
RaftSentPending: metric.NewGauge(metaRaftSentPending),
}

sm.raftRcvdMessages[raftpb.MsgHup] = sm.RaftRcvdMsgHup
sm.raftRcvdMessages[raftpb.MsgBeat] = sm.RaftRcvdMsgBeat
sm.raftRcvdMessages[raftpb.MsgProp] = sm.RaftRcvdMsgProp
sm.raftRcvdMessages[raftpb.MsgApp] = sm.RaftRcvdMsgApp
sm.raftRcvdMessages[raftpb.MsgAppResp] = sm.RaftRcvdMsgAppResp
sm.raftRcvdMessages[raftpb.MsgVote] = sm.RaftRcvdMsgVote
sm.raftRcvdMessages[raftpb.MsgVoteResp] = sm.RaftRcvdMsgVoteResp
sm.raftRcvdMessages[raftpb.MsgSnap] = sm.RaftRcvdMsgSnap
sm.raftRcvdMessages[raftpb.MsgHeartbeat] = sm.RaftRcvdMsgHeartbeat
sm.raftRcvdMessages[raftpb.MsgHeartbeatResp] = sm.RaftRcvdMsgHeartbeatResp
sm.raftRcvdMessages[raftpb.MsgUnreachable] = sm.RaftRcvdMsgUnreachable
sm.raftRcvdMessages[raftpb.MsgSnapStatus] = sm.RaftRcvdMsgSnapStatus
sm.raftRcvdMessages[raftpb.MsgCheckQuorum] = sm.RaftRcvdMsgCheckQuorum
sm.raftRcvdMessages[raftpb.MsgTransferLeader] = sm.RaftRcvdMsgTransferLeader
sm.raftRcvdMessages[raftpb.MsgTimeoutNow] = sm.RaftRcvdMsgTimeoutNow

storeRegistry.AddMetricStruct(sm)

return sm
54 changes: 51 additions & 3 deletions ui/app/containers/nodeGraphs.tsx
Original file line number Diff line number Diff line change
@@ -213,11 +213,59 @@ export default class extends React.Component<IInjectedProps, {}> {

<StackedAreaGraph title="Raft Time" sources={sources}>
<Axis label="Milliseconds" format={ (n) => d3.format(".1f")(NanoToMilli(n)) }>
<Metric name="cr.store.process-raft.waitingnanos" title="Waiting" nonNegativeRate />
<Metric name="cr.store.process-raft.workingnanos" title="Working" nonNegativeRate />
<Metric name="cr.store.process-raft.tickingnanos" title="Ticking" nonNegativeRate />
<Metric name="cr.store.raft.process.waitingnanos" title="Waiting" nonNegativeRate />
<Metric name="cr.store.raft.process.workingnanos" title="Working" nonNegativeRate />
<Metric name="cr.store.raft.process.tickingnanos" title="Ticking" nonNegativeRate />
</Axis>
</StackedAreaGraph>

<StackedAreaGraph title="Raft Messages received" sources={sources}>
<Axis label="Count" format={ d3.format(".1f") }>
<Metric name="cr.store.raft.received.hup" title="MsgHup" nonNegativeRate />
<Metric name="cr.store.raft.received.beat" title="MsgBeat" nonNegativeRate />
<Metric name="cr.store.raft.received.prop" title="MsgProp" nonNegativeRate />
<Metric name="cr.store.raft.received.app" title="MsgApp" nonNegativeRate />
<Metric name="cr.store.raft.received.appresp" title="MsgAppResp" nonNegativeRate />
<Metric name="cr.store.raft.received.vote" title="MsgVote" nonNegativeRate />
<Metric name="cr.store.raft.received.voteresp" title="MsgVoteResp" nonNegativeRate />
<Metric name="cr.store.raft.received.snap" title="MsgSnap" nonNegativeRate />
<Metric name="cr.store.raft.received.heartbeat" title="MsgHeartbeat" nonNegativeRate />
<Metric name="cr.store.raft.received.heartbeatresp" title="MsgHeartbeatResp" nonNegativeRate />
<Metric name="cr.store.raft.received.unreachable" title="MsgUnreachable" nonNegativeRate />
<Metric name="cr.store.raft.received.snapstatus" title="MsgSnapStatus" nonNegativeRate />
<Metric name="cr.store.raft.received.checkquorum" title="MsgCheckQuorum" nonNegativeRate />
<Metric name="cr.store.raft.received.transferleader" title="MsgTransferLeader" nonNegativeRate />
<Metric name="cr.store.raft.received.timeoutnow" title="MsgTimeoutNow" nonNegativeRate />
</Axis>
</StackedAreaGraph>

<LineGraph title="Raft Transport Queue Pending Count" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.sent.pending" title="Outstanding message count in the Raft Transport queue" />
</Axis>
</LineGraph>

<LineGraph title="Outgoing Raft Messages" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.sent.total" title="Outgoing Raft Messages - Total" nonNegativeRate />
<Metric name="cr.store.raft.sent.dropped" title="Outgoing Raft Messages - Dropped" nonNegativeRate />
</Axis>
</LineGraph>

<LineGraph title="Raft Leaders and LeaseHolders" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.leaders" title="Ranges on this Store that are Raft Leaders" />
<Metric name="cr.store.raft.leaseholders" title="Ranges on this Store that are Raft LeaseHolders" />
<Metric name="cr.store.raft.leaseholders.without.leadership" title="Ranges on this Store that are Raft LeaseHolders but aren't Raft Leaders" />
</Axis>
</LineGraph>

<LineGraph title="Raft Ticks" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.ticks" title="Raft Ticks" nonNegativeRate />
</Axis>
</LineGraph>

</GraphGroup>
</div>
</div>;
2 changes: 1 addition & 1 deletion ui/embedded.go

Large diffs are not rendered by default.

0 comments on commit 6375bee

Please sign in to comment.