Skip to content

Commit

Permalink
storage, ui: add metrics for Raft messages
Browse files Browse the repository at this point in the history
Add additional metrics for Raft messages to help in debugging: total
messages sent and received, transport queue length, dropped messages,
and ticks. Expose these metrics in the Admin UI, under the "Advanced
Internals" section. Closes cockroachdb#8645.
  • Loading branch information
Arjun Narayan committed Aug 25, 2016
1 parent 75921d4 commit dd801b1
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 4 deletions.
11 changes: 11 additions & 0 deletions storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -1872,6 +1872,16 @@ func (r *Replica) sendRaftMessage(msg raftpb.Message) {
return
}

// Refresh the raft.sent.pending gauge: while holding the transport mutex, sum
// the lengths of every queue stored under Transport.mu.queues[false].
// NOTE(review): the meaning of the `false` key is not visible from here —
// presumably it selects the non-snapshot queues; confirm against the transport.
r.store.ctx.Transport.mu.Lock()
transportQueues := r.store.ctx.Transport.mu.queues[false]
var queuedMsgs int64
for _, queue := range transportQueues {
queuedMsgs += int64(len(queue))
}
r.store.ctx.Transport.mu.Unlock()
// Publish the total after releasing the transport lock, and count this message
// as sent before SendAsync is attempted, so the sent counter also includes
// messages that the transport goes on to drop.
r.store.metrics.RaftSentPending.Update(queuedMsgs)
r.store.metrics.RaftSentMessages.Inc(1)

if !r.store.ctx.Transport.SendAsync(&RaftMessageRequest{
RangeID: rangeID,
ToReplica: toReplica,
Expand All @@ -1880,6 +1890,7 @@ func (r *Replica) sendRaftMessage(msg raftpb.Message) {
}) {
r.mu.Lock()
r.mu.droppedMessages++
r.store.metrics.RaftDroppedMessages.Inc(1)
r.mu.Unlock()

if err := r.withRaftGroup(func(raftGroup *raft.RawNode) error {
Expand Down
6 changes: 6 additions & 0 deletions storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2202,6 +2202,9 @@ func (s *Store) HandleRaftRequest(ctx context.Context, req *RaftMessageRequest)
}

addedPlaceholder := false

s.metrics.raftRcvdMessages[req.Message.Type].Inc(1)

switch req.Message.Type {
case raftpb.MsgSnap:
if earlyReturn := func() bool {
Expand Down Expand Up @@ -2575,6 +2578,9 @@ func (s *Store) processRaft() {
pendingReplicas = append(pendingReplicas, id)
}
}

s.metrics.RaftTicks.Inc(1)

s.mu.Unlock()
// Enqueue all pending ranges for readiness checks. Note that we could
// not hold the pendingRaftGroups lock during the previous loop because
Expand Down
155 changes: 152 additions & 3 deletions storage/store_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/storage/engine/enginepb"
"github.com/cockroachdb/cockroach/util/metric"
"github.com/cockroachdb/cockroach/util/syncutil"
"github.com/coreos/etcd/raft/raftpb"
)

var (
Expand Down Expand Up @@ -74,9 +75,91 @@ var (
metaRangeSnapshotsPreemptiveApplied = metric.Metadata{Name: "range.snapshots.preemptive-applied"}

// Raft processing metrics.
metaRaftSelectDurationNanos = metric.Metadata{Name: "process-raft.waitingnanos"}
metaRaftWorkingDurationNanos = metric.Metadata{Name: "process-raft.workingnanos"}
metaRaftTickingDurationNanos = metric.Metadata{Name: "process-raft.tickingnanos"}
metaRaftSelectDurationNanos = metric.Metadata{Name: "process-raft.waitingnanos",
Help: "Nanoseconds spent in store.processRaft() waiting"}
metaRaftWorkingDurationNanos = metric.Metadata{Name: "process-raft.workingnanos",
Help: "Nanoseconds spent in store.processRaft() working"}
metaRaftTickingDurationNanos = metric.Metadata{Name: "process-raft.tickingnanos",
Help: "Nanoseconds spent in store.processRaft() processing replica.Tick()"}

// Raft message metrics.
metaRaftReceivedHup = metric.Metadata{
Name: "raft.received.hup",
Help: "Total number of MsgHup messages received by this store",
}
metaRaftReceivedBeat = metric.Metadata{
Name: "raft.received.beat",
Help: "Total number of MsgBeat messages received by this store",
}
metaRaftReceivedProp = metric.Metadata{
Name: "raft.received.prop",
Help: "Total number of MsgProp messages received by this store",
}
metaRaftReceivedApp = metric.Metadata{
Name: "raft.received.app",
Help: "Total number of MsgApp messages received by this store",
}
metaRaftReceivedAppResp = metric.Metadata{
Name: "raft.received.appresp",
Help: "Total number of MsgAppResp messages received by this store",
}
metaRaftReceivedVote = metric.Metadata{
Name: "raft.received.vote",
Help: "Total number of MsgVote messages received by this store",
}
metaRaftReceivedVoteResp = metric.Metadata{
Name: "raft.received.voteresp",
Help: "Total number of MsgVoteResp messages received by this store",
}
metaRaftReceivedSnap = metric.Metadata{
Name: "raft.received.snap",
Help: "Total number of MsgSnap messages received by this store",
}
metaRaftReceivedHeartbeat = metric.Metadata{
Name: "raft.received.heartbeat",
Help: "Total number of MsgHeartbeat messages received by this store",
}
metaRaftReceivedHeartbeatResp = metric.Metadata{
Name: "raft.received.heartbeatresp",
Help: "Total number of MsgHeartbeatResp messages received by this store",
}
metaRaftReceivedUnreachable = metric.Metadata{
Name: "raft.received.unreachable",
Help: "Total number of MsgUnreachable messages received by this store",
}
metaRaftReceivedSnapStatus = metric.Metadata{
Name: "raft.received.snapstatus",
Help: "Total number of MsgSnapStatus messages received by this store",
}
metaRaftReceivedCheckQuorum = metric.Metadata{
Name: "raft.received.checkquorum",
Help: "Total number of MsgCheckQuorum messages received by this store",
}
metaRaftReceivedTransferLeader = metric.Metadata{
Name: "raft.received.transferleader",
Help: "Total number of MsgTransferLeader messages received by this store",
}
metaRaftReceivedTimeoutNow = metric.Metadata{
Name: "raft.received.timeoutnow",
Help: "Total number of MsgTimeoutNow messages received by this store",
}

metaRaftSentMessages = metric.Metadata{
Name: "raft.sent.total",
Help: "Total number of outgoing messages queued from this store (including messages that were later dropped)",
}

metaRaftDroppedMessages = metric.Metadata{
Name: "raft.sent.dropped",
Help: "Total number of outgoing messages from this store that were dropped by the transport",
}

// metaRaftTicks describes the counter of processed Raft ticks.
metaRaftTicks = metric.Metadata{
	Name: "raft.ticks",
	Help: "Total number of Raft ticks processed",
}

// metaRaftSentPending describes the gauge tracking how many outgoing
// messages are currently waiting in the Raft transport queue.
metaRaftSentPending = metric.Metadata{
	Name: "raft.sent.pending",
	Help: "Number of pending outgoing messages in the Raft Transport queue",
}
)

// StoreMetrics is the set of metrics for a given store.
Expand Down Expand Up @@ -141,6 +224,33 @@ type StoreMetrics struct {
RaftWorkingDurationNanos *metric.Counter
RaftTickingDurationNanos *metric.Counter

// Raft message metrics.
RaftRcvdMsgHup *metric.Counter
RaftRcvdMsgBeat *metric.Counter
RaftRcvdMsgProp *metric.Counter
RaftRcvdMsgApp *metric.Counter
RaftRcvdMsgAppResp *metric.Counter
RaftRcvdMsgVote *metric.Counter
RaftRcvdMsgVoteResp *metric.Counter
RaftRcvdMsgSnap *metric.Counter
RaftRcvdMsgHeartbeat *metric.Counter
RaftRcvdMsgHeartbeatResp *metric.Counter
RaftRcvdMsgUnreachable *metric.Counter
RaftRcvdMsgSnapStatus *metric.Counter
RaftRcvdMsgCheckQuorum *metric.Counter
RaftRcvdMsgTransferLeader *metric.Counter
RaftRcvdMsgTimeoutNow *metric.Counter

// A map for conveniently finding the appropriate metric. The individual
// metric references must exist as AddMetricStruct adds them by reflection
// on this struct and does not process map types.
// TODO(arjun): eliminate this duplication.
raftRcvdMessages map[raftpb.MessageType]*metric.Counter
RaftDroppedMessages *metric.Counter
RaftSentMessages *metric.Counter
RaftTicks *metric.Counter
RaftSentPending *metric.Gauge

// Stats for efficient merges.
// TODO(mrtracy): This should be removed as part of #4465. This is only
// maintained to keep the current structure of StatusSummaries; it would be
Expand Down Expand Up @@ -206,8 +316,47 @@ func newStoreMetrics() *StoreMetrics {
RaftSelectDurationNanos: metric.NewCounter(metaRaftSelectDurationNanos),
RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos),
RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos),

// Raft message metrics.
RaftRcvdMsgHup: metric.NewCounter(metaRaftReceivedHup),
RaftRcvdMsgBeat: metric.NewCounter(metaRaftReceivedBeat),
RaftRcvdMsgProp: metric.NewCounter(metaRaftReceivedProp),
RaftRcvdMsgApp: metric.NewCounter(metaRaftReceivedApp),
RaftRcvdMsgAppResp: metric.NewCounter(metaRaftReceivedAppResp),
RaftRcvdMsgVote: metric.NewCounter(metaRaftReceivedVote),
RaftRcvdMsgVoteResp: metric.NewCounter(metaRaftReceivedVoteResp),
RaftRcvdMsgSnap: metric.NewCounter(metaRaftReceivedSnap),
RaftRcvdMsgHeartbeat: metric.NewCounter(metaRaftReceivedHeartbeat),
RaftRcvdMsgHeartbeatResp: metric.NewCounter(metaRaftReceivedHeartbeatResp),
RaftRcvdMsgUnreachable: metric.NewCounter(metaRaftReceivedUnreachable),
RaftRcvdMsgSnapStatus: metric.NewCounter(metaRaftReceivedSnapStatus),
RaftRcvdMsgCheckQuorum: metric.NewCounter(metaRaftReceivedCheckQuorum),
RaftRcvdMsgTransferLeader: metric.NewCounter(metaRaftReceivedTransferLeader),
RaftRcvdMsgTimeoutNow: metric.NewCounter(metaRaftReceivedTimeoutNow),
raftRcvdMessages: make(map[raftpb.MessageType]*metric.Counter, len(raftpb.MessageType_name)),

RaftSentMessages: metric.NewCounter(metaRaftSentMessages),
RaftDroppedMessages: metric.NewCounter(metaRaftDroppedMessages),
RaftTicks: metric.NewCounter(metaRaftTicks),
RaftSentPending: metric.NewGauge(metaRaftSentPending),
}

// Index each per-type received-message counter by its raftpb.MessageType so
// that the right counter can be picked with a single map lookup
// (HandleRaftRequest indexes this map with req.Message.Type). A message type
// that is not listed here has no entry, so a lookup for it returns nil.
sm.raftRcvdMessages[raftpb.MsgHup] = sm.RaftRcvdMsgHup
sm.raftRcvdMessages[raftpb.MsgBeat] = sm.RaftRcvdMsgBeat
sm.raftRcvdMessages[raftpb.MsgProp] = sm.RaftRcvdMsgProp
sm.raftRcvdMessages[raftpb.MsgApp] = sm.RaftRcvdMsgApp
sm.raftRcvdMessages[raftpb.MsgAppResp] = sm.RaftRcvdMsgAppResp
sm.raftRcvdMessages[raftpb.MsgVote] = sm.RaftRcvdMsgVote
sm.raftRcvdMessages[raftpb.MsgVoteResp] = sm.RaftRcvdMsgVoteResp
sm.raftRcvdMessages[raftpb.MsgSnap] = sm.RaftRcvdMsgSnap
sm.raftRcvdMessages[raftpb.MsgHeartbeat] = sm.RaftRcvdMsgHeartbeat
sm.raftRcvdMessages[raftpb.MsgHeartbeatResp] = sm.RaftRcvdMsgHeartbeatResp
sm.raftRcvdMessages[raftpb.MsgUnreachable] = sm.RaftRcvdMsgUnreachable
sm.raftRcvdMessages[raftpb.MsgSnapStatus] = sm.RaftRcvdMsgSnapStatus
sm.raftRcvdMessages[raftpb.MsgCheckQuorum] = sm.RaftRcvdMsgCheckQuorum
sm.raftRcvdMessages[raftpb.MsgTransferLeader] = sm.RaftRcvdMsgTransferLeader
sm.raftRcvdMessages[raftpb.MsgTimeoutNow] = sm.RaftRcvdMsgTimeoutNow

storeRegistry.AddMetricStruct(sm)

return sm
Expand Down
40 changes: 40 additions & 0 deletions ui/app/containers/nodeGraphs.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,46 @@ export default class extends React.Component<IInjectedProps, {}> {
<Metric name="cr.store.process-raft.tickingnanos" title="Ticking" nonNegativeRate />
</Axis>
</StackedAreaGraph>

<StackedAreaGraph title="Raft Messages received" sources={sources}>
<Axis label="Count" format={ d3.format(".1f") }>
<Metric name="cr.store.raft.received.hup" title="MsgHup" nonNegativeRate />
<Metric name="cr.store.raft.received.beat" title="MsgBeat" nonNegativeRate />
<Metric name="cr.store.raft.received.prop" title="MsgProp" nonNegativeRate />
<Metric name="cr.store.raft.received.app" title="MsgApp" nonNegativeRate />
<Metric name="cr.store.raft.received.appresp" title="MsgAppResp" nonNegativeRate />
<Metric name="cr.store.raft.received.vote" title="MsgVote" nonNegativeRate />
<Metric name="cr.store.raft.received.voteresp" title="MsgVoteResp" nonNegativeRate />
<Metric name="cr.store.raft.received.snap" title="MsgSnap" nonNegativeRate />
<Metric name="cr.store.raft.received.heartbeat" title="MsgHeartbeat" nonNegativeRate />
<Metric name="cr.store.raft.received.heartbeatresp" title="MsgHeartbeatResp" nonNegativeRate />
<Metric name="cr.store.raft.received.unreachable" title="MsgUnreachable" nonNegativeRate />
<Metric name="cr.store.raft.received.snapstatus" title="MsgSnapStatus" nonNegativeRate />
<Metric name="cr.store.raft.received.checkquorum" title="MsgCheckQuorum" nonNegativeRate />
<Metric name="cr.store.raft.received.transferleader" title="MsgTransferLeader" nonNegativeRate />
<Metric name="cr.store.raft.received.timeoutnow" title="MsgTimeoutNow" nonNegativeRate />
</Axis>
</StackedAreaGraph>

{/* raft.sent.pending is a gauge holding the current transport queue length, so its value is plotted directly. nonNegativeRate is intended for ever-increasing counters and would chart the change in queue length instead. */}
<LineGraph title="Raft Transport Queue Pending Count" sources={sources}>
  <Axis format={ d3.format(".1f") }>
    <Metric name="cr.store.raft.sent.pending" title="Outstanding message count in the Raft Transport queue" />
  </Axis>
</LineGraph>

<LineGraph title="Outgoing Raft Messages" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.sent.total" title="Outgoing Raft Messages - Total" nonNegativeRate />
<Metric name="cr.store.raft.sent.dropped" title="Outgoing Raft Messages - Dropped" nonNegativeRate />
</Axis>
</LineGraph>

<LineGraph title="Raft Ticks" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.ticks" title="Raft Ticks" nonNegativeRate />
</Axis>
</LineGraph>

</GraphGroup>
</div>
</div>;
Expand Down
2 changes: 1 addition & 1 deletion ui/embedded.go

Large diffs are not rendered by default.

0 comments on commit dd801b1

Please sign in to comment.