Skip to content

Commit

Permalink
storage, ui: add metrics for Raft messages
Browse files Browse the repository at this point in the history
Add additional metrics for Raft messages to help in debugging: total
messages sent and received, transport queue length, dropped messages,
and ticks. Expose these metrics in the Admin UI, under the "Advanced
Internals" section. Closes cockroachdb#8645.
  • Loading branch information
Arjun Narayan committed Aug 24, 2016
1 parent 514206e commit 6240fab
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 4 deletions.
11 changes: 11 additions & 0 deletions storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -1872,6 +1872,16 @@ func (r *Replica) sendRaftMessage(msg raftpb.Message) {
return
}

// Refresh the transport queue gauge before handing the message to the
// transport: while holding Transport.mu, add up the length of every queue
// in Transport.mu.queues[false] and publish the sum.
// NOTE(review): this takes the transport-wide mutex and walks every queue
// each time execution reaches this point in sendRaftMessage; if that proves
// contended, consider sampling the queue length periodically instead.
// NOTE(review): only the queues[false] set is counted; presumably the other
// set carries snapshot traffic — confirm that excluding it is intended.
r.store.ctx.Transport.mu.Lock()
transportQueues := r.store.ctx.Transport.mu.queues[false]
var queuedMsgs int64
for _, queue := range transportQueues {
queuedMsgs += int64(len(queue))
}
r.store.metrics.RaftTransportQueueSize.Update(queuedMsgs)
r.store.ctx.Transport.mu.Unlock()
// Count the message as sent before SendAsync is attempted, so the total
// also includes messages that the transport goes on to drop.
r.store.metrics.RaftSentMessages.Inc(1)

if !r.store.ctx.Transport.SendAsync(&RaftMessageRequest{
RangeID: rangeID,
ToReplica: toReplica,
Expand All @@ -1880,6 +1890,7 @@ func (r *Replica) sendRaftMessage(msg raftpb.Message) {
}) {
// Record the drop on the replica under its own mutex, which guards
// droppedMessages.
r.mu.Lock()
r.mu.droppedMessages++
r.mu.Unlock()
// The store-level counter is not part of the replica's guarded state (other
// counters on r.store.metrics are incremented without r.mu), so update it
// after releasing the replica mutex to keep the critical section minimal.
r.store.metrics.RaftDroppedMessages.Inc(1)

if err := r.withRaftGroup(func(raftGroup *raft.RawNode) error {
Expand Down
6 changes: 6 additions & 0 deletions storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2202,6 +2202,9 @@ func (s *Store) HandleRaftRequest(ctx context.Context, req *RaftMessageRequest)
}

addedPlaceholder := false

s.metrics.RaftRcvdMessages[req.Message.Type].Inc(1)

switch req.Message.Type {
case raftpb.MsgSnap:
if earlyReturn := func() bool {
Expand Down Expand Up @@ -2575,6 +2578,9 @@ func (s *Store) processRaft() {
pendingReplicas = append(pendingReplicas, id)
}
}

s.metrics.RaftTicks.Inc(1)

s.mu.Unlock()
// Enqueue all pending ranges for readiness checks. Note that we could
// not hold the pendingRaftGroups lock during the previous loop because
Expand Down
161 changes: 158 additions & 3 deletions storage/store_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/storage/engine/enginepb"
"github.com/cockroachdb/cockroach/util/metric"
"github.com/cockroachdb/cockroach/util/syncutil"
"github.com/coreos/etcd/raft/raftpb"
)

var (
Expand Down Expand Up @@ -74,9 +75,95 @@ var (
metaRangeSnapshotsPreemptiveApplied = metric.Metadata{Name: "range.snapshots.preemptive-applied"}

// Raft processing metrics.
metaRaftSelectDurationNanos = metric.Metadata{Name: "process-raft.waitingnanos"}
metaRaftWorkingDurationNanos = metric.Metadata{Name: "process-raft.workingnanos"}
metaRaftTickingDurationNanos = metric.Metadata{Name: "process-raft.tickingnanos"}
metaRaftSelectDurationNanos = metric.Metadata{Name: "process-raft.waitingnanos",
Help: "Nanoseconds spent in store.processRaft() waiting"}
metaRaftWorkingDurationNanos = metric.Metadata{Name: "process-raft.workingnanos",
Help: "Nanoseconds spent in store.processRaft() working"}
metaRaftTickingDurationNanos = metric.Metadata{Name: "process-raft.tickingnanos",
Help: "Nanoseconds spent in store.processRaft() processing replica.Tick()"}

// Raft message metrics.
metaRaftRcvdMessages = map[raftpb.MessageType]metric.Metadata{
raftpb.MsgHup: {
Name: "raft-messages.rcvd.MsgHup",
Help: "Total number of MsgHup messages received by this store",
},
raftpb.MsgBeat: {
Name: "raft-messages.rcvd.MsgBeat",
Help: "Total number of MsgBeat messages received by this store",
},
raftpb.MsgProp: {
Name: "raft-messages.rcvd.MsgProp",
Help: "Total number of MsgProp messages received by this store",
},
raftpb.MsgApp: {
Name: "raft-messages.rcvd.MsgApp",
Help: "Total number of MsgApp messages received by this store",
},
raftpb.MsgAppResp: {
Name: "raft-messages.rcvd.MsgAppResp",
Help: "Total number of MsgAppResp messages received by this store",
},
raftpb.MsgVote: {
Name: "raft-messages.rcvd.MsgVote",
Help: "Total number of MsgVote messages received by this store",
},
raftpb.MsgVoteResp: {
Name: "raft-messages.rcvd.MsgVoteResp",
Help: "Total number of MsgVoteResp messages received by this store",
},
raftpb.MsgSnap: {
Name: "raft-messages.rcvd.MsgSnap",
Help: "Total number of MsgSnap messages received by this store",
},
raftpb.MsgHeartbeat: {
Name: "raft-messages.rcvd.MsgHeartbeat",
Help: "Total number of MsgHeartbeat messages received by this store",
},
raftpb.MsgHeartbeatResp: {
Name: "raft-messages.rcvd.MsgHeartbeatResp",
Help: "Total number of MsgHeartbeatResp messages received by this store",
},
raftpb.MsgUnreachable: {
Name: "raft-messages.rcvd.MsgUnreachable",
Help: "Total number of MsgUnreachable messages received by this store",
},
raftpb.MsgSnapStatus: {
Name: "raft-messages.rcvd.MsgSnapStatus",
Help: "Total number of MsgSnapStatus messages received by this store",
},
raftpb.MsgCheckQuorum: {
Name: "raft-messages.rcvd.MsgCheckQuorum",
Help: "Total number of MsgCheckQuorum messages received by this store",
},
raftpb.MsgTransferLeader: {
Name: "raft-messages.rcvd.MsgTransferLeader",
Help: "Total number of MsgTransferLeader messages received by this store",
},
raftpb.MsgTimeoutNow: {
Name: "raft-messages.rcvd.MsgTimeoutNow",
Help: "Total number of MsgTimeoutNow messages received by this store",
},
}

metaRaftSentMessages = metric.Metadata{
Name: "raft-messages.sent.total",
Help: "Total number of outgoing messages queued from this store (including messages that were later dropped)",
}

metaRaftDroppedMessages = metric.Metadata{
Name: "raft-messages.sent.dropped",
Help: "Total number of outgoing messages from this store that were dropped by the transport",
}

metaRaftTicks = metric.Metadata{Name: "raft.ticks",
Help: "Total number of Raft ticks processed"}

metaRaftTransportQueueSize = metric.Metadata{Name: "raft.transportqueue.size",
Help: "Number of outstanding messages in the Raft Transport queue"}

// metaRaftSnapshotCount describes the counter of snapshots received by this
// store.
// NOTE(review): none of the hunks visible in this change increment the
// counter built from this metadata (StoreMetrics.RaftSnapshotCount), and the
// UI graphs do not chart it — confirm it is updated elsewhere; if it is not,
// it will always report zero.
metaRaftSnapshotCount = metric.Metadata{Name: "raft.snapshot.count",
Help: "Number of snapshots received by this store"}
)

// StoreMetrics is the set of metrics for a given store.
Expand Down Expand Up @@ -141,6 +228,34 @@ type StoreMetrics struct {
RaftWorkingDurationNanos *metric.Counter
RaftTickingDurationNanos *metric.Counter

// Raft message metrics.
RaftRcvdMsgHup *metric.Counter
RaftRcvdMsgBeat *metric.Counter
RaftRcvdMsgProp *metric.Counter
RaftRcvdMsgApp *metric.Counter
RaftRcvdMsgAppResp *metric.Counter
RaftRcvdMsgVote *metric.Counter
RaftRcvdMsgVoteResp *metric.Counter
RaftRcvdMsgSnap *metric.Counter
RaftRcvdMsgHeartbeat *metric.Counter
RaftRcvdMsgHeartbeatResp *metric.Counter
RaftRcvdMsgUnreachable *metric.Counter
RaftRcvdMsgSnapStatus *metric.Counter
RaftRcvdMsgCheckQuorum *metric.Counter
RaftRcvdMsgTransferLeader *metric.Counter
RaftRcvdMsgTimeoutNow *metric.Counter

// A map for conveniently finding the appropriate metric. The individual
// metric references must exist as AddMetricStruct adds them by reflection
// on this struct and does not process map types.
// TODO(arjun): eliminate this duplication.
RaftRcvdMessages map[raftpb.MessageType]*metric.Counter
RaftDroppedMessages *metric.Counter
RaftSentMessages *metric.Counter
RaftTicks *metric.Counter
RaftTransportQueueSize *metric.Gauge
RaftSnapshotCount *metric.Counter

// Stats for efficient merges.
// TODO(mrtracy): This should be removed as part of #4465. This is only
// maintained to keep the current structure of StatusSummaries; it would be
Expand Down Expand Up @@ -206,8 +321,48 @@ func newStoreMetrics() *StoreMetrics {
RaftSelectDurationNanos: metric.NewCounter(metaRaftSelectDurationNanos),
RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos),
RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos),

// Raft message metrics.
RaftRcvdMsgHup: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgHup]),
RaftRcvdMsgBeat: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgBeat]),
RaftRcvdMsgProp: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgProp]),
RaftRcvdMsgApp: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgApp]),
RaftRcvdMsgAppResp: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgAppResp]),
RaftRcvdMsgVote: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgVote]),
RaftRcvdMsgVoteResp: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgVoteResp]),
RaftRcvdMsgSnap: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgSnap]),
RaftRcvdMsgHeartbeat: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgHeartbeat]),
RaftRcvdMsgHeartbeatResp: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgHeartbeatResp]),
RaftRcvdMsgUnreachable: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgUnreachable]),
RaftRcvdMsgSnapStatus: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgSnapStatus]),
RaftRcvdMsgCheckQuorum: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgCheckQuorum]),
RaftRcvdMsgTransferLeader: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgTransferLeader]),
RaftRcvdMsgTimeoutNow: metric.NewCounter(metaRaftRcvdMessages[raftpb.MsgTimeoutNow]),
RaftRcvdMessages: make(map[raftpb.MessageType]*metric.Counter, len(metaRaftRcvdMessages)),

RaftSentMessages: metric.NewCounter(metaRaftSentMessages),
RaftDroppedMessages: metric.NewCounter(metaRaftDroppedMessages),
RaftTicks: metric.NewCounter(metaRaftTicks),
RaftTransportQueueSize: metric.NewGauge(metaRaftTransportQueueSize),
RaftSnapshotCount: metric.NewCounter(metaRaftSnapshotCount),
}

sm.RaftRcvdMessages[raftpb.MsgHup] = sm.RaftRcvdMsgHup
sm.RaftRcvdMessages[raftpb.MsgBeat] = sm.RaftRcvdMsgBeat
sm.RaftRcvdMessages[raftpb.MsgProp] = sm.RaftRcvdMsgProp
sm.RaftRcvdMessages[raftpb.MsgApp] = sm.RaftRcvdMsgApp
sm.RaftRcvdMessages[raftpb.MsgAppResp] = sm.RaftRcvdMsgAppResp
sm.RaftRcvdMessages[raftpb.MsgVote] = sm.RaftRcvdMsgVote
sm.RaftRcvdMessages[raftpb.MsgVoteResp] = sm.RaftRcvdMsgVoteResp
sm.RaftRcvdMessages[raftpb.MsgSnap] = sm.RaftRcvdMsgSnap
sm.RaftRcvdMessages[raftpb.MsgHeartbeat] = sm.RaftRcvdMsgHeartbeat
sm.RaftRcvdMessages[raftpb.MsgHeartbeatResp] = sm.RaftRcvdMsgHeartbeatResp
sm.RaftRcvdMessages[raftpb.MsgUnreachable] = sm.RaftRcvdMsgUnreachable
sm.RaftRcvdMessages[raftpb.MsgSnapStatus] = sm.RaftRcvdMsgSnapStatus
sm.RaftRcvdMessages[raftpb.MsgCheckQuorum] = sm.RaftRcvdMsgCheckQuorum
sm.RaftRcvdMessages[raftpb.MsgTransferLeader] = sm.RaftRcvdMsgTransferLeader
sm.RaftRcvdMessages[raftpb.MsgTimeoutNow] = sm.RaftRcvdMsgTimeoutNow

storeRegistry.AddMetricStruct(sm)

return sm
Expand Down
40 changes: 40 additions & 0 deletions ui/app/containers/nodeGraphs.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,46 @@ export default class extends React.Component<IInjectedProps, {}> {
<Metric name="cr.store.process-raft.tickingnanos" title="Ticking" nonNegativeRate />
</Axis>
</StackedAreaGraph>

<StackedAreaGraph title="Raft Messages received" sources={sources}>
<Axis label="Count" format={ d3.format(".1f") }>
<Metric name="cr.store.raft-messages.rcvd.MsgHup" title="MsgHup" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgBeat" title="MsgBeat" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgProp" title="MsgProp" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgApp" title="MsgApp" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgAppResp" title="MsgAppResp" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgVote" title="MsgVote" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgVoteResp" title="MsgVoteResp" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgSnap" title="MsgSnap" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgHeartbeat" title="MsgHeartbeat" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgHeartbeatResp" title="MsgHeartbeatResp" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgUnreachable" title="MsgUnreachable" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgSnapStatus" title="MsgSnapStatus" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgCheckQuorum" title="MsgCheckQuorum" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgTransferLeader" title="MsgTransferLeader" nonNegativeRate />
<Metric name="cr.store.raft-messages.rcvd.MsgTimeoutNow" title="MsgTimeoutNow" nonNegativeRate />
</Axis>
</StackedAreaGraph>

{/* The transport queue size is a gauge (a point-in-time value), so it is
    plotted directly; charting it as a non-negative rate would show how fast
    the queue grows instead of how long it is. */}
<LineGraph title="Raft Transport Queue Size" sources={sources}>
  <Axis format={ d3.format(".1f") }>
    <Metric name="cr.store.raft.transportqueue.size" title="Raft Transport Queue Size" />
  </Axis>
</LineGraph>

<LineGraph title="Outgoing Raft Messages" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft-messages.sent.total" title="Outgoing Raft Messages - Total" nonNegativeRate />
<Metric name="cr.store.raft-messages.sent.dropped" title="Outgoing Raft Messages - Dropped" nonNegativeRate />
</Axis>
</LineGraph>

<LineGraph title="Raft Ticks" sources={sources}>
<Axis format={ d3.format(".1f") }>
<Metric name="cr.store.raft.ticks" title="Raft Ticks" nonNegativeRate />
</Axis>
</LineGraph>

</GraphGroup>
</div>
</div>;
Expand Down
2 changes: 1 addition & 1 deletion ui/embedded.go

Large diffs are not rendered by default.

0 comments on commit 6240fab

Please sign in to comment.