loqrecovery,cli: add debug recover verify command
This commit adds the debug recover verify command, which reports the
status of loss of quorum recovery plan application. The command is used
after debug recover apply-plan has been used to stage a recovery plan on
a cluster, to check application progress. It allows the user to check
which nodes still need to be restarted, the outcome of recovery on
restarted nodes, and the health of ranges across the entire cluster.

Release note: None
aliher1911 committed Feb 7, 2023
1 parent eb15802 commit 4b36ef5
Showing 21 changed files with 1,837 additions and 576 deletions.
33 changes: 31 additions & 2 deletions docs/generated/http/full.md
@@ -7613,6 +7613,7 @@ Support status: [reserved](#support-status)
| ----- | ---- | ----- | ----------- | -------------- |
| plan_id | [bytes](#cockroach.server.serverpb.RecoveryVerifyRequest-bytes) | | PlanID is ID of the plan to verify. | [reserved](#support-status) |
| decommissioned_node_ids | [int32](#cockroach.server.serverpb.RecoveryVerifyRequest-int32) | repeated | DecommissionedNodeIDs is a set of nodes that should be marked as decommissioned in the cluster when loss of quorum recovery successfully applies. | [reserved](#support-status) |
| max_reported_ranges | [int32](#cockroach.server.serverpb.RecoveryVerifyRequest-int32) | | MaxReportedRanges is the maximum number of failed ranges to report. If more unhealthy ranges are found, an error will be returned alongside the ranges to indicate that the list was cut short. | [reserved](#support-status) |
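As a rough sketch of how this request is assembled, mirroring the CLI code later in this commit (field names such as PendingPlanID, DecommissionedNodeIDs, and MaxReportedRanges are taken from that code rather than from this generated table, so treat them as an illustration, not the canonical API):

```go
package loqexample

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// makeVerifyRequest mirrors how the CLI builds the verification request:
// the set of nodes the plan expects to be decommissioned, a cap on how many
// unhealthy ranges the server reports back, and the pending plan ID when a
// concrete staged plan is being verified.
func makeVerifyRequest(plan loqrecoverypb.ReplicaUpdatePlan) serverpb.RecoveryVerifyRequest {
	req := serverpb.RecoveryVerifyRequest{
		DecommissionedNodeIDs: plan.DecommissionedNodeIDs,
		// If more unhealthy ranges than this are found, the response's
		// UnavailableRanges.Error notes that the list was cut short.
		MaxReportedRanges: 20,
	}
	if !plan.PlanID.Equal(uuid.UUID{}) {
		// Only set the plan ID when verifying a concrete staged plan.
		req.PendingPlanID = &plan.PlanID
	}
	return req
}
```

The cap keeps the response bounded on badly damaged clusters; the CLI below uses 20.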



@@ -7631,14 +7632,42 @@ Support status: [reserved](#support-status)
| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| statuses | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.NodeRecoveryStatus](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.NodeRecoveryStatus) | repeated | Statuses contains a list of recovery statuses of nodes updated during recovery. It also contains nodes that were expected to be live (not decommissioned by recovery) but failed to return a status response. | [reserved](#support-status) |
| unavailable_ranges | [cockroach.roachpb.RangeDescriptor](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.roachpb.RangeDescriptor) | repeated | UnavailableRanges contains descriptors of ranges that failed health checks. | [reserved](#support-status) |
| decommissioned_node_ids | [int32](#cockroach.server.serverpb.RecoveryVerifyResponse-int32) | repeated | DecommissionedNodeIDs contains a list of decommissioned node IDs. Only nodes that were decommissioned by the plan will be listed here, not all historically decommissioned ones. | [reserved](#support-status) |
| unavailable_ranges | [RecoveryVerifyResponse.UnavailableRanges](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.UnavailableRanges) | | UnavailableRanges contains information about ranges that failed the health check. | [reserved](#support-status) |
| decommissioned_node_statuses | [RecoveryVerifyResponse.DecommissionedNodeStatusesEntry](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.DecommissionedNodeStatusesEntry) | repeated | DecommissionedNodeStatuses contains a map of requested IDs with their corresponding liveness statuses. | [reserved](#support-status) |






<a name="cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.UnavailableRanges"></a>
#### RecoveryVerifyResponse.UnavailableRanges



| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| ranges | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.RangeRecoveryStatus](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.RangeRecoveryStatus) | repeated | Ranges contains descriptors of ranges that failed the health check. If there are too many ranges to report, the error field will contain a relevant message. | [reserved](#support-status) |
| error | [string](#cockroach.server.serverpb.RecoveryVerifyResponse-string) | | Error contains an optional error if range validation can't complete. | [reserved](#support-status) |





<a name="cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.DecommissionedNodeStatusesEntry"></a>
#### RecoveryVerifyResponse.DecommissionedNodeStatusesEntry



| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| key | [int32](#cockroach.server.serverpb.RecoveryVerifyResponse-int32) | | | |
| value | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | | |
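For orientation, a minimal sketch of consuming this response the way the CLI code later in this commit does, assuming the admin RPC client returned by getAdminClient; it prints unhealthy ranges (plus the truncation error, if the list was cut short) and any nodes the plan expects to be decommissioned but that are not yet:

```go
package loqexample

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// reportVerification calls RecoveryVerify and summarizes the result.
func reportVerification(
	ctx context.Context, admin serverpb.AdminClient, req *serverpb.RecoveryVerifyRequest,
) error {
	res, err := admin.RecoveryVerify(ctx, req)
	if err != nil {
		return err
	}
	// Ranges that failed the health check.
	for _, r := range res.UnavailableRanges.Ranges {
		fmt.Printf("unhealthy range r%d: %s, start key %s\n", r.RangeID, r.FailureType, r.StartKey)
	}
	if res.UnavailableRanges.Error != "" {
		// The range list was cut short (see MaxReportedRanges on the request).
		fmt.Printf("range health check incomplete: %s\n", res.UnavailableRanges.Error)
	}
	// Nodes requested for decommission that have not reached that status yet.
	for id, status := range res.DecommissionedNodeStatuses {
		if !status.Decommissioned() {
			fmt.Printf("n%d is not decommissioned yet: %s\n", id, status)
		}
	}
	return nil
}
```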






## ListTenants

6 changes: 6 additions & 0 deletions pkg/base/test_server_args.go
@@ -194,6 +194,12 @@ type TestClusterArgs struct {
// A copy of an entry from this map will be copied to each individual server
// and potentially adjusted according to ReplicationMode.
ServerArgsPerNode map[int]TestServerArgs

// If ReusableListeners is true, then a restart should keep the listeners
// untouched so that servers are kept on the same ports. It is up to the test
// to provide proxy listeners that can be closed by test servers and then
// reused upon restart.
ReusableListeners bool
}

var (
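The "proxy listeners" the comment refers to are supplied by the test itself. As a purely hypothetical sketch (none of these names exist in the test framework), such a wrapper might swallow the Close issued by a stopping server so the port survives a restart:

```go
package loqexample

import "net"

// reusableListener is a hypothetical proxy listener for tests that set
// ReusableListeners: it ignores Close() issued by a stopping test server so
// the same port can be handed to the restarted server, and is only truly
// closed when the test calls Release().
type reusableListener struct {
	net.Listener
}

// Close is a no-op so a server restart keeps the underlying port.
func (l *reusableListener) Close() error { return nil }

// Release closes the wrapped listener once the test is done with it.
func (l *reusableListener) Release() error { return l.Listener.Close() }

func newReusableListener(addr string) (*reusableListener, error) {
	inner, err := net.Listen("tcp", addr)
	if err != nil {
		return nil, err
	}
	return &reusableListener{Listener: inner}, nil
}
```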
1 change: 1 addition & 0 deletions pkg/cli/BUILD.bazel
@@ -399,6 +399,7 @@ go_test(
"//pkg/util/stop",
"//pkg/util/timeutil",
"//pkg/util/tracing",
"//pkg/util/uuid",
"//pkg/workload/examples",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
285 changes: 280 additions & 5 deletions pkg/cli/debug_recover_loss_of_quorum.go
@@ -104,7 +104,39 @@ recovery steps to verify the data and ensure database consistency should be
taken ASAP. Those actions should be done at application level.
'debug recover' set of commands is used as a last resort to perform range
recovery operation. To perform recovery one should perform this sequence
recovery operation.
Loss of quorum recovery can be performed in two modes: half-online and
offline. The half-online approach is the preferred one, but the offline
approach is preserved for compatibility with any existing tooling. The main
difference between the approaches is how information is collected from and
distributed within the cluster, and how many nodes need to be restarted
during recovery.
To perform recovery using the half-online approach, one should perform this
sequence of actions:
1. Run 'cockroach debug recover make-plan' in half-online mode to collect
replica information from the surviving nodes of a cluster and decide which
replicas should survive and up-replicate.
2. Run 'cockroach debug recover apply-plan' in half-online mode to distribute
the plan to surviving cluster nodes for application. At this point the plan
is staged and can't be reverted.
3. Follow the instructions from apply-plan to perform a rolling restart of the
nodes that need to update their storage. The restart should be done using the
appropriate automation that is used to run the cluster.
4. Optionally use 'cockroach debug recover verify' to check recovery progress
and resulting range health.
If it was possible to produce, distribute, and apply the plan, then the cluster
should become operational again. It is not guaranteed that there is no data
loss or that database consistency was not compromised.
If for whatever reason the half-online approach is not feasible, or it fails
when collecting info or distributing recovery plans, one could perform this
sequence of actions:
0. Decommission failed nodes preemptively to eliminate the possibility of
@@ -124,7 +156,7 @@ should be collected and made locally available for the next step.
on step 1. Planner will decide which replicas should survive and
up-replicate.
4. Run 'cockroach debug recover execute-plan' on every node using plan
4. Run 'cockroach debug recover apply-plan' on every node using plan
generated on the previous step. Each node will pick relevant portion of
the plan and update local replicas accordingly to restore quorum.
@@ -134,7 +166,28 @@ If it was possible to produce and apply the plan, then cluster should
become operational again. It is not guaranteed that there's no data loss
and that all database consistency was not compromised.
Example run:
Example run #1 (half-online mode):
Suppose we have a cluster of 5 nodes 1-5 where we lost nodes 3 and 4. Each node
has two stores, numbered 1,2 on node 1; 3,4 on node 2, and so on.
The recovery commands to recover unavailable ranges would be (most command
output is omitted for brevity):
[cockroach@admin ~]$ cockroach debug recover make-plan --host cockroach-1.cockroachlabs.com --certs-dir=root_certs -o recovery-plan.json
[cockroach@admin ~]$ cockroach debug recover apply-plan --host cockroach-1.cockroachlabs.com --certs-dir=root_certs recovery-plan.json
Proceed with staging plan [y/N] y
Plan staged. To complete recovery restart nodes n2, n3.
[cockroach@admin ~]$ # restart-nodes 2 3 as instructed by apply-plan.
[cockroach@admin ~]$ cockroach debug recover verify --host cockroach-1.cockroachlabs.com --certs-dir=root_certs recovery-plan.json
Loss of quorum recovery is complete.
Example run #2 (offline mode):
Suppose we have a cluster of 5 nodes 1-5 where we lost nodes 3 and 4. Each node
has two stores, numbered 1,2 on node 1; 3,4 on node 2, and so on.
@@ -174,14 +227,15 @@ var recoverCommands = []*cobra.Command{
debugRecoverCollectInfoCmd,
debugRecoverPlanCmd,
debugRecoverExecuteCmd,
//debugRecoverVerify,
debugRecoverVerifyCmd,
}

func init() {
debugRecoverCmd.AddCommand(
debugRecoverCollectInfoCmd,
debugRecoverPlanCmd,
debugRecoverExecuteCmd)
debugRecoverExecuteCmd,
debugRecoverVerifyCmd)
}

var debugRecoverCollectInfoCmd = &cobra.Command{
@@ -791,6 +845,227 @@ func applyRecoveryToLocalStore(
return err
}

var debugRecoverVerifyCmd = &cobra.Command{
Use: "verify [plan-file]",
Short: "verify loss of quorum recovery application status",
Long: `
Check the cluster's loss of quorum recovery state.
The verify command will check that all nodes have applied the recovery plan
and that all necessary nodes are decommissioned.
If invoked without a plan file, the command will print the status of all nodes
in the cluster.
The address of a single healthy cluster node must be provided using the --host
flag. This designated node will retrieve and check the status of all nodes in
the cluster.
See the debug recover command help for more details on how to use this command.
`,
Args: cobra.MaximumNArgs(1),
RunE: runDebugVerify,
}

func runDebugVerify(cmd *cobra.Command, args []string) error {
// We must have a cancellable context here to obtain a gRPC client connection.
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
stopper := stop.NewStopper()
defer stopper.Stop(ctx)

var updatePlan loqrecoverypb.ReplicaUpdatePlan
if len(args) > 0 {
planFile := args[0]
data, err := os.ReadFile(planFile)
if err != nil {
return errors.Wrapf(err, "failed to read plan file %q", planFile)
}
jsonpb := protoutil.JSONPb{Indent: " "}
if err = jsonpb.Unmarshal(data, &updatePlan); err != nil {
return errors.Wrapf(err, "failed to unmarshal plan from file %q", planFile)
}
}

// Plan statuses.
if len(updatePlan.Updates) > 0 {
_, _ = fmt.Printf("Checking application of recovery plan %s\n", updatePlan.PlanID)
}

c, finish, err := getAdminClient(ctx, serverCfg)
if err != nil {
return errors.Wrapf(err, "failed to get admin connection to cluster")
}
defer finish()
req := serverpb.RecoveryVerifyRequest{
DecommissionedNodeIDs: updatePlan.DecommissionedNodeIDs,
MaxReportedRanges: 20,
}
// Maybe switch to non-nullable?
if !updatePlan.PlanID.Equal(uuid.UUID{}) {
req.PendingPlanID = &updatePlan.PlanID
}
res, err := c.RecoveryVerify(ctx, &req)
if err != nil {
return errors.Wrapf(err, "failed to retrieve replica info from cluster")
}

if len(res.UnavailableRanges.Ranges) > 0 {
_, _ = fmt.Fprintf(stderr, "Unavailable ranges:\n")
for _, d := range res.UnavailableRanges.Ranges {
_, _ = fmt.Fprintf(stderr, " r%d : %s, start key %s\n",
d.RangeID, d.FailureType, d.StartKey)
}
}
if res.UnavailableRanges.Error != "" {
_, _ = fmt.Fprintf(stderr, "Failed to complete range health check: %s\n",
res.UnavailableRanges.Error)
}

diff := diffPlanWithNodeStatus(updatePlan, res.Statuses)
if len(diff.report) > 0 {
if len(updatePlan.Updates) > 0 {
_, _ = fmt.Fprintf(stderr, "Recovery plan application progress:\n")
} else {
_, _ = fmt.Fprintf(stderr, "Recovery plans:\n")
}
}
for _, line := range diff.report {
_, _ = fmt.Fprintf(stderr, "%s\n", line)
}

// Node statuses.
allDecommissioned := true
var b strings.Builder
for id, status := range res.DecommissionedNodeStatuses {
if !status.Decommissioned() {
b.WriteString(fmt.Sprintf(" n%d: %s\n", id, status))
allDecommissioned = false
}
}
if len(res.DecommissionedNodeStatuses) > 0 {
if allDecommissioned {
_, _ = fmt.Fprintf(stderr, "All dead nodes are decommissioned.\n")
} else {
_, _ = fmt.Fprintf(stderr, "Nodes not yet decommissioned:\n%s", b.String())
}
}

if len(updatePlan.Updates) > 0 {
if !allDecommissioned || diff.pending > 0 {
return errors.New("loss of quorum recovery is not finished yet")
}
if diff.errors > 0 || !res.UnavailableRanges.Empty() {
return errors.New("loss of quorum recovery did not fully succeed")
}
_, _ = fmt.Fprintf(stderr, "Loss of quorum recovery is complete.\n")
} else {
if diff.errors > 0 || !res.UnavailableRanges.Empty() {
return errors.New("cluster has unhealthy ranges")
}
}
return nil
}

type clusterDiff struct {
report []string
pending int
errors int
}

func (d *clusterDiff) append(line string) {
d.report = append(d.report, line)
}

func (d *clusterDiff) appendPending(line string) {
d.report = append(d.report, line)
d.pending++
}

func (d *clusterDiff) appendError(line string) {
d.report = append(d.report, line)
d.errors++
}

func diffPlanWithNodeStatus(
updatePlan loqrecoverypb.ReplicaUpdatePlan, nodes []loqrecoverypb.NodeRecoveryStatus,
) clusterDiff {
var result clusterDiff

nodesWithPlan := make(map[roachpb.NodeID]interface{})
for _, r := range updatePlan.Updates {
nodesWithPlan[r.NodeID()] = struct{}{}
}

// Sort statuses by node id for ease of readability.
sort.Slice(nodes, func(i, j int) bool {
return nodes[i].NodeID < nodes[j].NodeID
})
// Plan statuses.
if len(nodesWithPlan) > 0 {
// Invoked with plan, need to verify application of concrete plan to the
// cluster.
for _, status := range nodes {
if _, ok := nodesWithPlan[status.NodeID]; ok {
// Nodes where we expect the plan to be pending or applied.
switch {
case status.AppliedPlanID != nil && status.AppliedPlanID.Equal(updatePlan.PlanID) && status.Error != "":
result.appendError(fmt.Sprintf(" plan application failed on node n%d: %s", status.NodeID, status.Error))
case status.AppliedPlanID != nil && status.AppliedPlanID.Equal(updatePlan.PlanID):
result.append(fmt.Sprintf(" plan applied successfully on node n%d", status.NodeID))
case status.PendingPlanID != nil && status.PendingPlanID.Equal(updatePlan.PlanID):
result.appendPending(fmt.Sprintf(" plan application pending on node n%d", status.NodeID))
case status.PendingPlanID != nil:
result.appendError(fmt.Sprintf(" unexpected staged plan %s on node n%d", *status.PendingPlanID, status.NodeID))
case status.PendingPlanID == nil:
result.appendError(fmt.Sprintf(" failed to find staged plan on node n%d", status.NodeID))
}
delete(nodesWithPlan, status.NodeID)
} else {
switch {
case status.PendingPlanID != nil && status.PendingPlanID.Equal(updatePlan.PlanID):
result.appendError(fmt.Sprintf(" plan staged on n%d but no replicas are planned for update on the node", status.NodeID))
case status.PendingPlanID != nil:
result.appendError(fmt.Sprintf(" unexpected staged plan %s on node n%d", *status.PendingPlanID, status.NodeID))
}
}
}
// Check if any nodes that must have a plan staged or applied are missing
// from received node statuses.
var missing []roachpb.NodeID
for k := range nodesWithPlan {
missing = append(missing, k)
}
sort.Slice(missing, func(i, j int) bool {
return missing[i] < missing[j]
})
for _, id := range missing {
result.appendError(fmt.Sprintf(" failed to find node n%d where plan must be staged", id))
}
} else {
// Invoked without a plan, just dump collected information without making
// any conclusions.
for _, status := range nodes {
if status.PendingPlanID != nil {
result.append(fmt.Sprintf(" node n%d staged plan: %s", status.NodeID,
*status.PendingPlanID))
}
switch {
case status.Error != "" && status.AppliedPlanID != nil:
result.append(fmt.Sprintf(" node n%d failed to apply plan %s: %s", status.NodeID,
*status.AppliedPlanID, status.Error))
case status.Error != "":
result.append(fmt.Sprintf(" node n%d failed to apply plan: %s", status.NodeID,
status.Error))
case status.AppliedPlanID != nil:
result.append(fmt.Sprintf(" node n%d applied plan: %s at %s", status.NodeID,
*status.AppliedPlanID, status.ApplyTimestamp))
}
}
}
return result
}

func formatNodeStores(locations []loqrecovery.NodeStores, indent string) string {
hasMultiStore := false
for _, v := range locations {
Expand Down