Skip to content

Commit

Permalink
ui: add decommissioning relevant graphs to metrics replication dashboard
Browse files Browse the repository at this point in the history
This change adds new graphs to the metrics replication
dashboard. New metrics visualized on the dashboard can be used
to help triage decommissioning issues. Metrics visualized
include:
- queue.replicate.addreplica.(success|error)
- queue.replicate.removereplica.(success|error)
- queue.replicate.replacedeadreplica.(success|error)
- queue.replicate.removedeadreplica.(success|error)
- queue.replicate.replacedecommissioningreplica.(success|error)
- queue.replicate.removedecommissioningreplica.(success|error)
- range.snapshots.recv-queue
- range.snapshots.unknown.rcvd-bytes
- range.snapshots.rebalancing.rcvd-bytes
- range.snapshots.recovery.rcvd-bytes

Release justification: low risk, high benefit changes to
existing functionality.

Resolves cockroachdb#86599

Release note (ui change): introduce new graphs on metrics
replication dashboard to improve decommissioning observability
  • Loading branch information
Santamaura committed Aug 25, 2022
1 parent 8888295 commit 57097ab
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,10 @@ export const CircuitBreakerTrippedEventsTooltip: React.FC = () => (
/**
 * Renders a `<div>` containing a one-sentence description of the
 * paused-followers count.
 */
export const PausedFollowersTooltip: React.FC = () => {
  return (
    <div>The number of nonessential followers that have replication paused.</div>
  );
};

/**
 * Renders a `<div>` describing the count of snapshots waiting in a receiver's
 * queue. Passed as the `tooltip` of the "Receiver Snapshots Queued" graph.
 */
export const ReceiverSnapshotsQueuedTooltip: React.FC = () => {
  return (
    <div>
      The number of snapshots queued to be applied on a receiver which can only{" "}
      send/accept 1 at a time per store.
    </div>
  );
};
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
CircuitBreakerTrippedReplicasTooltip,
LogicalBytesGraphTooltip,
PausedFollowersTooltip,
ReceiverSnapshotsQueuedTooltip,
} from "src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips";
import { cockroach } from "src/js/protos";
import TimeSeriesQueryAggregator = cockroach.ts.tspb.TimeSeriesQueryAggregator;
Expand Down Expand Up @@ -179,13 +180,45 @@ export default function (props: GraphDashboardProps) {

<LineGraph title="Snapshot Data Received" sources={storeSources}>
<Axis label="bytes">
{_.map(nodeIDs, nid => (
<>
<Metric
key={nid}
name="cr.store.range.snapshots.unknown.rcvd-bytes"
title={nodeDisplayName(nodesSummary, nid) + "-unknown"}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.store.range.snapshots.rebalancing.rcvd-bytes"
title={nodeDisplayName(nodesSummary, nid) + "-rebalancing"}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.store.range.snapshots.recovery.rcvd-bytes"
title={nodeDisplayName(nodesSummary, nid) + "-recovery"}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
</>
))}
</Axis>
</LineGraph>,
<LineGraph
title="Receiver Snapshots Queued"
sources={storeSources}
tooltip={ReceiverSnapshotsQueuedTooltip}
>
<Axis label="snapshots" units={AxisUnits.Count}>
{_.map(nodeIDs, nid => (
<Metric
key={nid}
name="cr.store.range.snapshots.rcvd-bytes"
name="cr.store.range.snapshots.recv-queue"
title={nodeDisplayName(nodesSummary, nid)}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
))}
</Axis>
Expand Down Expand Up @@ -241,5 +274,76 @@ export default function (props: GraphDashboardProps) {
))}
</Axis>
</LineGraph>,
<LineGraph
title="Replicate Queue Actions: Successes"
sources={storeSources}
>
<Axis label="replicas" units={AxisUnits.Count}>
<Metric
name="cr.store.queue.replicate.addreplica.success"
title={"Replicas Added / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removereplica.success"
title={"Replicas Removed / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedeadreplica.success"
title={"Dead Replicas Replaced / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedeadreplica.success"
title={"Dead Replicas Removed / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedecommissioningreplica.success"
title={"Decommissioning Replicas Replaced / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedecommissioningreplica.success"
title={"Decommissioning Replicas Removed / Sec"}
nonNegativeRate
/>
</Axis>
</LineGraph>,
<LineGraph title="Replicate Queue Actions: Failures" sources={storeSources}>
<Axis label="replicas" units={AxisUnits.Count}>
<Metric
name="cr.store.queue.replicate.addreplica.error"
title={"Replicas Added / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removereplica.error"
title={"Replicas Removed / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedeadreplica.error"
title={"Dead Replicas Replaced / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedeadreplica.error"
title={"Dead Replicas Removed / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedecommissioningreplica.error"
title={"Decommissioning Replicas Replaced / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedecommissioningreplica.error"
title={"Decommissioning Replicas Removed / Sec"}
nonNegativeRate
/>
</Axis>
</LineGraph>,
];
}

0 comments on commit 57097ab

Please sign in to comment.