loqrecovery,cli: add debug recover verify command
This commit adds the debug recover verify command, which reports the
status of loss of quorum recovery plan application. The command is used
after debug recover apply-plan has been used to stage a recovery plan on
a cluster, to check application progress. It allows the user to check
which nodes still need to be restarted, the outcome of recovery on
restarted nodes, and the health of ranges across the entire cluster.

Release note: None
aliher1911 committed Feb 7, 2023
1 parent eb15802 commit 4b36ef5
Showing 21 changed files with 1,837 additions and 576 deletions.
33 changes: 31 additions & 2 deletions docs/generated/http/full.md
@@ -7613,6 +7613,7 @@ Support status: [reserved](#support-status)
| ----- | ---- | ----- | ----------- | -------------- |
| plan_id | [bytes](#cockroach.server.serverpb.RecoveryVerifyRequest-bytes) | | PlanID is ID of the plan to verify. | [reserved](#support-status) |
| decommissioned_node_ids | [int32](#cockroach.server.serverpb.RecoveryVerifyRequest-int32) | repeated | DecommissionedNodeIDs is a set of nodes that should be marked as decommissioned in the cluster when loss of quorum recovery successfully applies. | [reserved](#support-status) |
| max_reported_ranges | [int32](#cockroach.server.serverpb.RecoveryVerifyRequest-int32) | | MaxReportedRanges is the maximum number of failed ranges to report. If more unhealthy ranges are found, an error will be returned alongside the ranges to indicate that the list was cut short. | [reserved](#support-status) |
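As a rough sketch of how this request is assembled, mirroring the CLI code later in this commit (field names such as PendingPlanID, DecommissionedNodeIDs, and MaxReportedRanges are taken from that code rather than from this generated table, so treat them as an illustration, not the canonical API):

```go
package loqexample

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// makeVerifyRequest mirrors how the CLI builds the verification request:
// the set of nodes the plan expects to be decommissioned, a cap on how many
// unhealthy ranges the server reports back, and the pending plan ID when a
// concrete staged plan is being verified.
func makeVerifyRequest(plan loqrecoverypb.ReplicaUpdatePlan) serverpb.RecoveryVerifyRequest {
	req := serverpb.RecoveryVerifyRequest{
		DecommissionedNodeIDs: plan.DecommissionedNodeIDs,
		// If more unhealthy ranges than this are found, the response's
		// UnavailableRanges.Error notes that the list was cut short.
		MaxReportedRanges: 20,
	}
	if !plan.PlanID.Equal(uuid.UUID{}) {
		// Only set the plan ID when verifying a concrete staged plan.
		req.PendingPlanID = &plan.PlanID
	}
	return req
}
```

The cap keeps the response bounded on badly damaged clusters; the CLI below uses 20.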



@@ -7631,14 +7632,42 @@ Support status: [reserved](#support-status)
| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| statuses | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.NodeRecoveryStatus](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.NodeRecoveryStatus) | repeated | Statuses contains a list of recovery statuses of nodes updated during recovery. It also contains nodes that were expected to be live (not decommissioned by recovery) but failed to return a status response. | [reserved](#support-status) |
| unavailable_ranges | [cockroach.roachpb.RangeDescriptor](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.roachpb.RangeDescriptor) | repeated | UnavailableRanges contains descriptors of ranges that failed health checks. | [reserved](#support-status) |
| decommissioned_node_ids | [int32](#cockroach.server.serverpb.RecoveryVerifyResponse-int32) | repeated | DecommissionedNodeIDs contains a list of decommissioned node IDs. Only nodes that were decommissioned by the plan will be listed here, not all historically decommissioned ones. | [reserved](#support-status) |
| unavailable_ranges | [RecoveryVerifyResponse.UnavailableRanges](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.UnavailableRanges) | | UnavailableRanges contains information about ranges that failed the health check. | [reserved](#support-status) |
| decommissioned_node_statuses | [RecoveryVerifyResponse.DecommissionedNodeStatusesEntry](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.DecommissionedNodeStatusesEntry) | repeated | DecommissionedNodeStatuses contains a map of requested IDs with their corresponding liveness statuses. | [reserved](#support-status) |






<a name="cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.UnavailableRanges"></a>
#### RecoveryVerifyResponse.UnavailableRanges



| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| ranges | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.RangeRecoveryStatus](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.RangeRecoveryStatus) | repeated | Ranges contains descriptors of ranges that failed the health check. If there are too many ranges to report, the error field will contain a relevant message. | [reserved](#support-status) |
| error | [string](#cockroach.server.serverpb.RecoveryVerifyResponse-string) | | Error contains an optional error if range validation can't complete. | [reserved](#support-status) |





<a name="cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.server.serverpb.RecoveryVerifyResponse.DecommissionedNodeStatusesEntry"></a>
#### RecoveryVerifyResponse.DecommissionedNodeStatusesEntry



| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| key | [int32](#cockroach.server.serverpb.RecoveryVerifyResponse-int32) | | | |
| value | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.RecoveryVerifyResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | | |
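For orientation, a minimal sketch of consuming this response the way the CLI code later in this commit does, assuming the admin RPC client returned by getAdminClient; it prints unhealthy ranges (plus the truncation error, if the list was cut short) and any nodes the plan expects to be decommissioned but that are not yet:

```go
package loqexample

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// reportVerification calls RecoveryVerify and summarizes the result.
func reportVerification(
	ctx context.Context, admin serverpb.AdminClient, req *serverpb.RecoveryVerifyRequest,
) error {
	res, err := admin.RecoveryVerify(ctx, req)
	if err != nil {
		return err
	}
	// Ranges that failed the health check.
	for _, r := range res.UnavailableRanges.Ranges {
		fmt.Printf("unhealthy range r%d: %s, start key %s\n", r.RangeID, r.FailureType, r.StartKey)
	}
	if res.UnavailableRanges.Error != "" {
		// The range list was cut short (see MaxReportedRanges on the request).
		fmt.Printf("range health check incomplete: %s\n", res.UnavailableRanges.Error)
	}
	// Nodes requested for decommission that have not reached that status yet.
	for id, status := range res.DecommissionedNodeStatuses {
		if !status.Decommissioned() {
			fmt.Printf("n%d is not decommissioned yet: %s\n", id, status)
		}
	}
	return nil
}
```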






## ListTenants

6 changes: 6 additions & 0 deletions pkg/base/test_server_args.go
@@ -194,6 +194,12 @@ type TestClusterArgs struct {
// A copy of an entry from this map will be copied to each individual server
// and potentially adjusted according to ReplicationMode.
ServerArgsPerNode map[int]TestServerArgs

// If ReusableListeners is true, then a restart should keep the listeners
// untouched so that servers are kept on the same ports. It is up to the test
// to provide proxy listeners that can be closed by test servers and then
// reused upon restart.
ReusableListeners bool
}

var (
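The "proxy listeners" the comment refers to are supplied by the test itself. As a purely hypothetical sketch (none of these names exist in the test framework), such a wrapper might swallow the Close issued by a stopping server so the port survives a restart:

```go
package loqexample

import "net"

// reusableListener is a hypothetical proxy listener for tests that set
// ReusableListeners: it ignores Close() issued by a stopping test server so
// the same port can be handed to the restarted server, and is only truly
// closed when the test calls Release().
type reusableListener struct {
	net.Listener
}

// Close is a no-op so a server restart keeps the underlying port.
func (l *reusableListener) Close() error { return nil }

// Release closes the wrapped listener once the test is done with it.
func (l *reusableListener) Release() error { return l.Listener.Close() }

func newReusableListener(addr string) (*reusableListener, error) {
	inner, err := net.Listen("tcp", addr)
	if err != nil {
		return nil, err
	}
	return &reusableListener{Listener: inner}, nil
}
```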
1 change: 1 addition & 0 deletions pkg/cli/BUILD.bazel
@@ -399,6 +399,7 @@ go_test(
"//pkg/util/stop",
"//pkg/util/timeutil",
"//pkg/util/tracing",
"//pkg/util/uuid",
"//pkg/workload/examples",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
285 changes: 280 additions & 5 deletions pkg/cli/debug_recover_loss_of_quorum.go
@@ -104,7 +104,39 @@ recovery steps to verify the data and ensure database consistency should be
taken ASAP. Those actions should be done at application level.
'debug recover' set of commands is used as a last resort to perform range
recovery operation. To perform recovery one should perform this sequence
recovery operation.
Loss of quorum recovery can be performed in two modes: half-online and
offline. The half-online approach is the preferred one, but the offline
approach is preserved for compatibility with any existing tooling. The main
difference between the approaches is how information is collected from and
distributed within the cluster, and how many nodes need to be restarted
during recovery.
To perform recovery using the half-online approach, one should perform this
sequence of actions:
1. Run 'cockroach debug recover make-plan' in half-online mode to collect
replica information from the surviving nodes of a cluster and decide which
replicas should survive and up-replicate.
2. Run 'cockroach debug recover apply-plan' in half-online mode to distribute
the plan to surviving cluster nodes for application. At this point the plan
is staged and can't be reverted.
3. Follow the instructions from apply-plan to perform a rolling restart of the
nodes that need to update their storage. The restart should be done using the
appropriate automation that is used to run the cluster.
4. Optionally use 'cockroach debug recover verify' to check recovery progress
and resulting range health.
If it was possible to produce, distribute, and apply the plan, then the cluster
should become operational again. It is not guaranteed that there is no data
loss or that database consistency was not compromised.
If for whatever reason the half-online approach is not feasible, or it fails
when collecting info or distributing recovery plans, one could perform this
sequence of actions:
0. Decommission failed nodes preemptively to eliminate the possibility of
@@ -124,7 +156,7 @@ should be collected and made locally available for the next step.
on step 1. Planner will decide which replicas should survive and
up-replicate.
4. Run 'cockroach debug recover execute-plan' on every node using plan
4. Run 'cockroach debug recover apply-plan' on every node using plan
generated on the previous step. Each node will pick relevant portion of
the plan and update local replicas accordingly to restore quorum.
@@ -134,7 +166,28 @@ If it was possible to produce and apply the plan, then cluster should
become operational again. It is not guaranteed that there's no data loss
and that all database consistency was not compromised.
Example run:
Example run #1 (half-online mode):
Suppose we have a cluster of 5 nodes 1-5 where we lost nodes 3 and 4. Each node
has two stores, numbered 1,2 on node 1; 3,4 on node 2, and so on.
The recovery commands to recover unavailable ranges would be (most command
output is omitted for brevity):
[cockroach@admin ~]$ cockroach debug recover make-plan --host cockroach-1.cockroachlabs.com --certs-dir=root_certs -o recovery-plan.json
[cockroach@admin ~]$ cockroach debug recover apply-plan --host cockroach-1.cockroachlabs.com --certs-dir=root_certs recovery-plan.json
Proceed with staging plan [y/N] y
Plan staged. To complete recovery restart nodes n2, n3.
[cockroach@admin ~]$ # restart-nodes 2 3 as instructed by apply-plan.
[cockroach@admin ~]$ cockroach debug recover verify --host cockroach-1.cockroachlabs.com --certs-dir=root_certs recovery-plan.json
Loss of quorum recovery is complete.
Example run #2 (offline mode):
Suppose we have a cluster of 5 nodes 1-5 where we lost nodes 3 and 4. Each node
has two stores, numbered 1,2 on node 1; 3,4 on node 2, and so on.
@@ -174,14 +227,15 @@ var recoverCommands = []*cobra.Command{
debugRecoverCollectInfoCmd,
debugRecoverPlanCmd,
debugRecoverExecuteCmd,
//debugRecoverVerify,
debugRecoverVerifyCmd,
}

func init() {
debugRecoverCmd.AddCommand(
debugRecoverCollectInfoCmd,
debugRecoverPlanCmd,
debugRecoverExecuteCmd)
debugRecoverExecuteCmd,
debugRecoverVerifyCmd)
}

var debugRecoverCollectInfoCmd = &cobra.Command{
@@ -791,6 +845,227 @@ func applyRecoveryToLocalStore(
return err
}

var debugRecoverVerifyCmd = &cobra.Command{
Use: "verify [plan-file]",
Short: "verify loss of quorum recovery application status",
Long: `
Check the cluster's loss of quorum recovery state.
The verify command will check that all nodes have applied the recovery plan
and that all necessary nodes are decommissioned.
If invoked without a plan file, the command will print the status of all nodes
in the cluster.
The address of a single healthy cluster node must be provided using the --host
flag. This designated node will retrieve and check the status of all nodes in
the cluster.
See the debug recover command help for more details on how to use this command.
`,
Args: cobra.MaximumNArgs(1),
RunE: runDebugVerify,
}

func runDebugVerify(cmd *cobra.Command, args []string) error {
// We must have a cancellable context here to obtain a gRPC client connection.
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
stopper := stop.NewStopper()
defer stopper.Stop(ctx)

var updatePlan loqrecoverypb.ReplicaUpdatePlan
if len(args) > 0 {
planFile := args[0]
data, err := os.ReadFile(planFile)
if err != nil {
return errors.Wrapf(err, "failed to read plan file %q", planFile)
}
jsonpb := protoutil.JSONPb{Indent: " "}
if err = jsonpb.Unmarshal(data, &updatePlan); err != nil {
return errors.Wrapf(err, "failed to unmarshal plan from file %q", planFile)
}
}

// Plan statuses.
if len(updatePlan.Updates) > 0 {
_, _ = fmt.Printf("Checking application of recovery plan %s\n", updatePlan.PlanID)
}

c, finish, err := getAdminClient(ctx, serverCfg)
if err != nil {
return errors.Wrapf(err, "failed to get admin connection to cluster")
}
defer finish()
req := serverpb.RecoveryVerifyRequest{
DecommissionedNodeIDs: updatePlan.DecommissionedNodeIDs,
MaxReportedRanges: 20,
}
// Maybe switch to non-nullable?
if !updatePlan.PlanID.Equal(uuid.UUID{}) {
req.PendingPlanID = &updatePlan.PlanID
}
res, err := c.RecoveryVerify(ctx, &req)
if err != nil {
return errors.Wrapf(err, "failed to retrieve replica info from cluster")
}

if len(res.UnavailableRanges.Ranges) > 0 {
_, _ = fmt.Fprintf(stderr, "Unavailable ranges:\n")
for _, d := range res.UnavailableRanges.Ranges {
_, _ = fmt.Fprintf(stderr, " r%d : %s, start key %s\n",
d.RangeID, d.FailureType, d.StartKey)
}
}
if res.UnavailableRanges.Error != "" {
_, _ = fmt.Fprintf(stderr, "Failed to complete range health check: %s\n",
res.UnavailableRanges.Error)
}

diff := diffPlanWithNodeStatus(updatePlan, res.Statuses)
if len(diff.report) > 0 {
if len(updatePlan.Updates) > 0 {
_, _ = fmt.Fprintf(stderr, "Recovery plan application progress:\n")
} else {
_, _ = fmt.Fprintf(stderr, "Recovery plans:\n")
}
}
for _, line := range diff.report {
_, _ = fmt.Fprintf(stderr, "%s\n", line)
}

// Node statuses.
allDecommissioned := true
var b strings.Builder
for id, status := range res.DecommissionedNodeStatuses {
if !status.Decommissioned() {
b.WriteString(fmt.Sprintf(" n%d: %s\n", id, status))
allDecommissioned = false
}
}
if len(res.DecommissionedNodeStatuses) > 0 {
if allDecommissioned {
_, _ = fmt.Fprintf(stderr, "All dead nodes are decommissioned.\n")
} else {
_, _ = fmt.Fprintf(stderr, "Nodes not yet decommissioned:\n%s", b.String())
}
}

if len(updatePlan.Updates) > 0 {
if !allDecommissioned || diff.pending > 0 {
return errors.New("loss of quorum recovery is not finished yet")
}
if diff.errors > 0 || !res.UnavailableRanges.Empty() {
return errors.New("loss of quorum recovery did not fully succeed")
}
_, _ = fmt.Fprintf(stderr, "Loss of quorum recovery is complete.\n")
} else {
if diff.errors > 0 || !res.UnavailableRanges.Empty() {
return errors.New("cluster has unhealthy ranges")
}
}
return nil
}

type clusterDiff struct {
report []string
pending int
errors int
}

func (d *clusterDiff) append(line string) {
d.report = append(d.report, line)
}

func (d *clusterDiff) appendPending(line string) {
d.report = append(d.report, line)
d.pending++
}

func (d *clusterDiff) appendError(line string) {
d.report = append(d.report, line)
d.errors++
}

func diffPlanWithNodeStatus(
updatePlan loqrecoverypb.ReplicaUpdatePlan, nodes []loqrecoverypb.NodeRecoveryStatus,
) clusterDiff {
var result clusterDiff

nodesWithPlan := make(map[roachpb.NodeID]interface{})
for _, r := range updatePlan.Updates {
nodesWithPlan[r.NodeID()] = struct{}{}
}

// Sort statuses by node id for ease of readability.
sort.Slice(nodes, func(i, j int) bool {
return nodes[i].NodeID < nodes[j].NodeID
})
// Plan statuses.
if len(nodesWithPlan) > 0 {
// Invoked with plan, need to verify application of concrete plan to the
// cluster.
for _, status := range nodes {
if _, ok := nodesWithPlan[status.NodeID]; ok {
// Nodes where we expect the plan to be pending or applied.
switch {
case status.AppliedPlanID != nil && status.AppliedPlanID.Equal(updatePlan.PlanID) && status.Error != "":
result.appendError(fmt.Sprintf(" plan application failed on node n%d: %s", status.NodeID, status.Error))
case status.AppliedPlanID != nil && status.AppliedPlanID.Equal(updatePlan.PlanID):
result.append(fmt.Sprintf(" plan applied successfully on node n%d", status.NodeID))
case status.PendingPlanID != nil && status.PendingPlanID.Equal(updatePlan.PlanID):
result.appendPending(fmt.Sprintf(" plan application pending on node n%d", status.NodeID))
case status.PendingPlanID != nil:
result.appendError(fmt.Sprintf(" unexpected staged plan %s on node n%d", *status.PendingPlanID, status.NodeID))
case status.PendingPlanID == nil:
result.appendError(fmt.Sprintf(" failed to find staged plan on node n%d", status.NodeID))
}
delete(nodesWithPlan, status.NodeID)
} else {
switch {
case status.PendingPlanID != nil && status.PendingPlanID.Equal(updatePlan.PlanID):
result.appendError(fmt.Sprintf(" plan staged on n%d but no replicas are planned for update on the node", status.NodeID))
case status.PendingPlanID != nil:
result.appendError(fmt.Sprintf(" unexpected staged plan %s on node n%d", *status.PendingPlanID, status.NodeID))
}
}
}
// Check if any nodes that must have a plan staged or applied are missing
// from received node statuses.
var missing []roachpb.NodeID
for k := range nodesWithPlan {
missing = append(missing, k)
}
sort.Slice(missing, func(i, j int) bool {
return missing[i] < missing[j]
})
for _, id := range missing {
result.appendError(fmt.Sprintf(" failed to find node n%d where plan must be staged", id))
}
} else {
// Invoked without a plan, just dump collected information without making
// any conclusions.
for _, status := range nodes {
if status.PendingPlanID != nil {
result.append(fmt.Sprintf(" node n%d staged plan: %s", status.NodeID,
*status.PendingPlanID))
}
switch {
case status.Error != "" && status.AppliedPlanID != nil:
result.append(fmt.Sprintf(" node n%d failed to apply plan %s: %s", status.NodeID,
*status.AppliedPlanID, status.Error))
case status.Error != "":
result.append(fmt.Sprintf(" node n%d failed to apply plan: %s", status.NodeID,
status.Error))
case status.AppliedPlanID != nil:
result.append(fmt.Sprintf(" node n%d applied plan: %s at %s", status.NodeID,
*status.AppliedPlanID, status.ApplyTimestamp))
}
}
}
return result
}

func formatNodeStores(locations []loqrecovery.NodeStores, indent string) string {
hasMultiStore := false
for _, v := range locations {
Expand Down