Skip to content

Commit

Permalink
ui: add decommissioning-relevant graphs to metrics replication dashboard
Browse files Browse the repository at this point in the history
This change adds new graphs to the metrics replication
dashboard. New metrics visualized on the dashboard can be used
to help triage decommissioning issues. Metrics visualized
include:
- queue.replicate.addreplica.(success|error)
- queue.replicate.removereplica.(success|error)
- queue.replicate.replacedeadreplica.(success|error)
- queue.replicate.removedeadreplica.(success|error)
- queue.replicate.replacedecommissioningreplica.(success|error)
- queue.replicate.removedecommissioningreplica.(success|error)
- range.snapshots.recv-queue
- range.snapshots.unknown.rcvd-bytes
- range.snapshots.rebalancing.rcvd-bytes
- range.snapshots.recovery.rcvd-bytes

Release justification: low risk, high benefit changes to
existing functionality.

Resolves cockroachdb#86599

Release note (ui change): introduce new graphs on the metrics
replication dashboard to improve decommissioning observability.
  • Loading branch information
Santamaura committed Sep 1, 2022
1 parent cb57def commit e424d85
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,10 @@ export const CircuitBreakerTrippedEventsTooltip: React.FC = () => (
/**
 * Static tooltip content: renders a single `<div>` explaining that the
 * value shown is the number of nonessential followers whose replication
 * is paused. Takes no props and holds no state.
 */
export const PausedFollowersTooltip: React.FC = () => {
  return (
    <div>The number of nonessential followers that have replication paused.</div>
  );
};

/**
 * Static tooltip content: renders a single `<div>` describing the number
 * of snapshots queued to be applied on a receiver, noting that a store
 * accepts only one at a time. Takes no props and holds no state.
 */
export const ReceiverSnapshotsQueuedTooltip: React.FC = () => {
  return (
    <div>
      The number of snapshots queued to be applied on a receiver which can only{" "}
      accept 1 at a time per store.
    </div>
  );
};
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
CircuitBreakerTrippedReplicasTooltip,
LogicalBytesGraphTooltip,
PausedFollowersTooltip,
ReceiverSnapshotsQueuedTooltip,
} from "src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips";
import { cockroach } from "src/js/protos";
import TimeSeriesQueryAggregator = cockroach.ts.tspb.TimeSeriesQueryAggregator;
Expand Down Expand Up @@ -178,14 +179,39 @@ export default function (props: GraphDashboardProps) {
</LineGraph>,

<LineGraph title="Snapshot Data Received" sources={storeSources}>
<Axis label="bytes">
<Axis label="bytes" units={AxisUnits.Bytes}>
{_.map(nodeIDs, nid => (
<>
<Metric
key={nid}
name="cr.store.range.snapshots.rebalancing.rcvd-bytes"
title={nodeDisplayName(nodesSummary, nid) + "-rebalancing"}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.store.range.snapshots.recovery.rcvd-bytes"
title={nodeDisplayName(nodesSummary, nid) + "-recovery"}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
</>
))}
</Axis>
</LineGraph>,
<LineGraph
title="Receiver Snapshots Queued"
sources={storeSources}
tooltip={ReceiverSnapshotsQueuedTooltip}
>
<Axis label="snapshots" units={AxisUnits.Count}>
{_.map(nodeIDs, nid => (
<Metric
key={nid}
name="cr.store.range.snapshots.rcvd-bytes"
name="cr.store.range.snapshots.recv-queue"
title={nodeDisplayName(nodesSummary, nid)}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
))}
</Axis>
Expand Down Expand Up @@ -241,5 +267,89 @@ export default function (props: GraphDashboardProps) {
))}
</Axis>
</LineGraph>,
<LineGraph
title="Replicate Queue Actions: Successes"
sources={storeSources}
>
<Axis label="replicas" units={AxisUnits.Count}>
<Metric
name="cr.store.queue.replicate.addreplica.success"
title={"Replicas Added / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removereplica.success"
title={"Replicas Removed / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedeadreplica.success"
title={"Dead Replicas Replaced / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedeadreplica.success"
title={"Dead Replicas Removed / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedecommissioningreplica.success"
title={"Decommissioning Replicas Replaced / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedecommissioningreplica.success"
title={"Decommissioning Replicas Removed / Sec"}
nonNegativeRate
/>
</Axis>
</LineGraph>,
<LineGraph title="Replicate Queue Actions: Failures" sources={storeSources}>
<Axis label="replicas" units={AxisUnits.Count}>
<Metric
name="cr.store.queue.replicate.addreplica.error"
title={"Replicas Added Errors / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removereplica.error"
title={"Replicas Removed Errors / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedeadreplica.error"
title={"Dead Replicas Replaced Errors / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedeadreplica.error"
title={"Dead Replicas Removed Errors / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.replacedecommissioningreplica.error"
title={"Decommissioning Replicas Replaced Errors / Sec"}
nonNegativeRate
/>
<Metric
name="cr.store.queue.replicate.removedecommissioningreplica.error"
title={"Decommissioning Replicas Removed Errors / Sec"}
nonNegativeRate
/>
</Axis>
</LineGraph>,
<LineGraph title="Decommissioning Errors" sources={storeSources}>
<Axis label="replicas" units={AxisUnits.Count}>
{_.map(nodeIDs, nid => (
<Metric
key={nid}
name="cr.store.queue.replicate.replacedecommissioningreplica.error"
title={nodeDisplayName(nodesSummary, nid) + " - Replaced Errors / Sec"}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,
];
}

0 comments on commit e424d85

Please sign in to comment.