From 2bd238ae292ecfcea40a901e0ac688171850d888 Mon Sep 17 00:00:00 2001 From: Santamaura Date: Thu, 18 Aug 2022 10:25:06 -0400 Subject: [PATCH] ui, server: surface paused replicas in problem ranges, range report, and replication metrics This change surfaces paused replicas in the problem ranges page, in the range report, and as a new chart in the replication metrics. Release justification: low risk, high benefit changes to existing functionality. Resolves: #84489 Release note (ui change): surface paused replicas to range report, problem ranges, and replication metrics pages. --- docs/generated/http/full.md | 4 ++++ docs/generated/swagger/spec.json | 4 ++++ pkg/server/problem_ranges.go | 5 +++++ pkg/server/serverpb/status.proto | 6 ++++++ pkg/server/status.go | 1 + .../nodeGraphs/dashboards/graphTooltips.tsx | 4 ++++ .../nodeGraphs/dashboards/replication.tsx | 18 ++++++++++++++++++ .../problemRanges/connectionsTable.tsx | 7 ++++++- .../reports/containers/problemRanges/index.tsx | 5 +++++ .../reports/containers/range/rangeTable.tsx | 8 ++++++++ 10 files changed, 61 insertions(+), 1 deletion(-) diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md index b1d2f7551d91..08e940161f3d 100644 --- a/docs/generated/http/full.md +++ b/docs/generated/http/full.md @@ -1309,6 +1309,7 @@ RangeProblems describes issues reported by a range. For internal use only. | quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) | | raft_log_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) | | circuit_breaker_error | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) | +| paused_followers | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) | @@ -1554,6 +1555,7 @@ RangeProblems describes issues reported by a range. For internal use only. | quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) | | raft_log_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) | | circuit_breaker_error | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) | +| paused_followers | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) | @@ -3366,6 +3368,7 @@ Support status: [reserved](#support-status) | quiescent_equals_ticking_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) | | raft_log_too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) | | circuit_breaker_error_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) | +| paused_replica_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) | @@ -3758,6 +3761,7 @@ RangeProblems describes issues reported by a range. For internal use only. | quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) | | raft_log_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) | | circuit_breaker_error | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) | +| paused_followers | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) | diff --git a/docs/generated/swagger/spec.json b/docs/generated/swagger/spec.json index 7ec2c125eab6..41c97cfdb881 100644 --- a/docs/generated/swagger/spec.json +++ b/docs/generated/swagger/spec.json @@ -1257,6 +1257,10 @@ "type": "boolean", "x-go-name": "Overreplicated" }, + "paused_followers": { + "type": "boolean", + "x-go-name": "PausedFollowers" + }, "quiescent_equals_ticking": { "description": "Quiescent ranges do not tick by definition, but we track this in\ntwo different ways and suspect that they're getting out of sync.\nIf the replica's quiescent flag doesn't agree with the store's\nlist of replicas that are ticking, warn about it.", "type": "boolean", diff --git a/pkg/server/problem_ranges.go b/pkg/server/problem_ranges.go index 8c0c24c84d85..c47ecf0c69b5 100644 --- a/pkg/server/problem_ranges.go +++ b/pkg/server/problem_ranges.go @@ -133,6 +133,10 @@ func (s *statusServer) ProblemRanges( problems.CircuitBreakerErrorRangeIDs = append(problems.CircuitBreakerErrorRangeIDs, info.State.Desc.RangeID) } + if info.Problems.PausedFollowers { + problems.PausedReplicaIDs = + append(problems.PausedReplicaIDs, info.State.Desc.RangeID) + } } sort.Sort(roachpb.RangeIDSlice(problems.UnavailableRangeIDs)) sort.Sort(roachpb.RangeIDSlice(problems.RaftLeaderNotLeaseHolderRangeIDs)) @@ -143,6 +147,7 @@ func (s *statusServer) ProblemRanges( sort.Sort(roachpb.RangeIDSlice(problems.QuiescentEqualsTickingRangeIDs)) sort.Sort(roachpb.RangeIDSlice(problems.RaftLogTooLargeRangeIDs)) sort.Sort(roachpb.RangeIDSlice(problems.CircuitBreakerErrorRangeIDs)) + sort.Sort(roachpb.RangeIDSlice(problems.PausedReplicaIDs)) response.ProblemsByNodeID[resp.nodeID] = problems case <-ctx.Done(): return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error()) diff --git a/pkg/server/serverpb/status.proto b/pkg/server/serverpb/status.proto index 93452f3b93d1..54504e884ef1 100644 --- a/pkg/server/serverpb/status.proto +++ b/pkg/server/serverpb/status.proto @@ -389,6 +389,7 @@ message RangeProblems { // When the raft log is too large, it can be a symptom of other issues. bool raft_log_too_large = 7; bool circuit_breaker_error = 9; + bool paused_followers = 10; } // RangeStatistics describes statistics reported by a range. For internal use @@ -1223,6 +1224,11 @@ message ProblemRangesResponse { (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" ]; + repeated int64 paused_replica_ids = 11 [ + (gogoproto.customname) = "PausedReplicaIDs", + (gogoproto.casttype) = + "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" + ]; } reserved 1 to 7; // NodeID is the node that submitted all the requests. diff --git a/pkg/server/status.go b/pkg/server/status.go index 73ac3259965d..c2d5b2c581c3 100644 --- a/pkg/server/status.go +++ b/pkg/server/status.go @@ -2045,6 +2045,7 @@ func (s *statusServer) rangesHelper( QuiescentEqualsTicking: raftStatus != nil && metrics.Quiescent == metrics.Ticking, RaftLogTooLarge: metrics.RaftLogTooLarge, CircuitBreakerError: len(state.CircuitBreakerError) > 0, + PausedFollowers: metrics.PausedFollowerCount > 0, }, LeaseStatus: metrics.LeaseStatus, Quiescent: metrics.Quiescent, diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips.tsx index 16c436df3799..f562127c9de6 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips.tsx @@ -165,3 +165,7 @@ export const CircuitBreakerTrippedEventsTooltip: React.FC = () => ( since the process started. ); + +export const PausedFollowersTooltip: React.FC = () => ( +
The number of nonessential followers that have replication paused.
+); diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx index f57b374d3d48..5411bd8d5d99 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx @@ -24,6 +24,7 @@ import { CircuitBreakerTrippedEventsTooltip, CircuitBreakerTrippedReplicasTooltip, LogicalBytesGraphTooltip, + PausedFollowersTooltip, } from "src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips"; import { cockroach } from "src/js/protos"; import TimeSeriesQueryAggregator = cockroach.ts.tspb.TimeSeriesQueryAggregator; @@ -223,5 +224,22 @@ export default function (props: GraphDashboardProps) { ))} , + + + {_.map(nodeIDs, nid => ( + + ))} + + , ]; } diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx index e2f5ee04c9aa..4fffbbe7e130 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx @@ -73,6 +73,10 @@ const connectionTableColumns: ConnectionTableColumn[] = [ title: "Circuit breaker error", extract: problem => problem.circuit_breaker_error_range_ids.length, }, + { + title: "Paused Replicas", + extract: problem => problem.paused_replica_ids.length, + }, { title: "Total", extract: problem => { @@ -85,7 +89,8 @@ const connectionTableColumns: ConnectionTableColumn[] = [ problem.overreplicated_range_ids.length + problem.quiescent_equals_ticking_range_ids.length + problem.raft_log_too_large_range_ids.length + - problem.circuit_breaker_error_range_ids.length + problem.circuit_breaker_error_range_ids.length + + problem.paused_replica_ids.length ); }, }, diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx index 78cb4c454de3..5401729d8989 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx @@ -212,6 +212,11 @@ export class ProblemRanges extends React.Component { problems={problems} extract={problem => problem.circuit_breaker_error_range_ids} /> + problem.paused_replica_ids} + /> ); } diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx index 9e39b96bde9f..0e9a5f89e0e9 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx @@ -265,6 +265,11 @@ const rangeTableDisplayList: RangeTableRow[] = [ display: "Locality Info", compareToLeader: false, }, + { + variable: "pausedFollowers", + display: "Paused Followers", + compareToLeader: false, + }, ]; const rangeTableEmptyContent: RangeTableCellContent = { @@ -899,6 +904,9 @@ export default class RangeTable extends React.Component { tier => `${tier.key}: ${tier.value}`, ), })), + pausedFollowers: this.createContent( + info.state.paused_replicas?.join(", "), + ), }); });