diff --git a/pkg/cli/BUILD.bazel b/pkg/cli/BUILD.bazel index 2adc2051911c..0e9085be3ffd 100644 --- a/pkg/cli/BUILD.bazel +++ b/pkg/cli/BUILD.bazel @@ -210,6 +210,7 @@ go_library( "//pkg/workload/ycsb", "@com_github_cockroachdb_apd_v3//:apd", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_errors//hintdetail", "@com_github_cockroachdb_errors//oserror", "@com_github_cockroachdb_logtags//:logtags", "@com_github_cockroachdb_pebble//tool", diff --git a/pkg/cli/debug.go b/pkg/cli/debug.go index 08a9cdc9d082..2ff1d7823a09 100644 --- a/pkg/cli/debug.go +++ b/pkg/cli/debug.go @@ -1654,6 +1654,9 @@ func init() { "list of dead store IDs") f.VarP(&debugRecoverPlanOpts.confirmAction, cliflags.ConfirmActions.Name, cliflags.ConfirmActions.Shorthand, cliflags.ConfirmActions.Usage()) + f.BoolVar(&debugRecoverPlanOpts.force, "force", false, + "force creation of plan even when problems were encountered; applying this plan may "+ + "result in additional problems and should be done only with care and as a last resort") f = debugRecoverExecuteCmd.Flags() f.VarP(&debugRecoverExecuteOpts.Stores, cliflags.RecoverStore.Name, cliflags.RecoverStore.Shorthand, cliflags.RecoverStore.Usage()) diff --git a/pkg/cli/debug_recover_loss_of_quorum.go b/pkg/cli/debug_recover_loss_of_quorum.go index 46a8803ff42e..7080edb996da 100644 --- a/pkg/cli/debug_recover_loss_of_quorum.go +++ b/pkg/cli/debug_recover_loss_of_quorum.go @@ -31,6 +31,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/hintdetail" "github.com/spf13/cobra" ) @@ -264,6 +265,7 @@ var debugRecoverPlanOpts struct { outputFileName string deadStoreIDs []int confirmAction confirmActionFlag + force bool } func runDebugPlanReplicaRemoval(cmd *cobra.Command, args []string) error { @@ -302,28 +304,63 @@ Discarded live replicas: %d _, _ = fmt.Fprintf(stderr, "%s\n\n", deadStoreMsg) } - if len(plan.Updates) == 0 { - _, _ = fmt.Fprintf(stderr, "No recoverable ranges found.\n") - return nil + planningErr := report.Error() + if planningErr != nil { + // Need to warn user before they make a decision that ignoring + // inconsistencies is a really bad idea. + _, _ = fmt.Fprintf(stderr, + "Found replica inconsistencies:\n\n%s\n\nOnly proceed as a last resort!\n", + hintdetail.FlattenDetails(planningErr)) + } + + if debugRecoverPlanOpts.confirmAction == allNo { + return errors.New("abort") } switch debugRecoverPlanOpts.confirmAction { case prompt: - _, _ = fmt.Fprintf(stderr, "Proceed with plan creation [y/N] ") - reader := bufio.NewReader(os.Stdin) - line, err := reader.ReadString('\n') - if err != nil { - return errors.Wrap(err, "failed to read user input") + opts := "y/N" + if planningErr != nil { + opts = "f/N" } - _, _ = fmt.Fprintf(stderr, "\n") - if len(line) < 1 || (line[0] != 'y' && line[0] != 'Y') { - _, _ = fmt.Fprint(stderr, "Aborted at user request\n") - return nil + done := false + for !done { + _, _ = fmt.Fprintf(stderr, "Proceed with plan creation [%s] ", opts) + reader := bufio.NewReader(os.Stdin) + line, err := reader.ReadString('\n') + if err != nil { + return errors.Wrap(err, "failed to read user input") + } + line = strings.ToLower(strings.TrimSpace(line)) + if len(line) == 0 { + line = "n" + } + switch line { + case "y": + // We ignore y if we have errors. In that case you can only force or + // abandon attempt. 
+ if planningErr != nil { + continue + } + done = true + case "f": + done = true + case "n": + return errors.New("abort") + } } case allYes: - // All actions enabled by default. + if planningErr != nil && !debugRecoverPlanOpts.force { + return errors.Errorf( + "can not create plan because of errors and no --force flag is given") + } default: - return errors.New("Aborted by --confirm option") + return errors.New("unexpected CLI error, try using different --confirm option value") + } + + if len(plan.Updates) == 0 { + _, _ = fmt.Fprintln(stderr, "Found no ranges in need of recovery, nothing to do.") + return nil } var writer io.Writer = os.Stdout diff --git a/pkg/kv/kvserver/loqrecovery/BUILD.bazel b/pkg/kv/kvserver/loqrecovery/BUILD.bazel index ee37ba00f2c0..3375db937b29 100644 --- a/pkg/kv/kvserver/loqrecovery/BUILD.bazel +++ b/pkg/kv/kvserver/loqrecovery/BUILD.bazel @@ -19,18 +19,22 @@ go_library( "//pkg/kv/kvserver/stateloader", "//pkg/roachpb:with-mocks", "//pkg/storage", + "//pkg/storage/enginepb", "//pkg/util/hlc", "//pkg/util/log", "//pkg/util/protoutil", "//pkg/util/timeutil", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", + "@io_etcd_go_etcd_raft_v3//raftpb", ], ) go_test( name = "loqrecovery_test", srcs = [ + "collect_raft_log_test.go", + "main_test.go", "record_test.go", "recovery_env_test.go", "recovery_test.go", @@ -38,18 +42,28 @@ go_test( data = glob(["testdata/**"]), embed = [":loqrecovery"], deps = [ + "//pkg/base", "//pkg/keys", "//pkg/kv/kvserver", + "//pkg/kv/kvserver/kvserverbase", "//pkg/kv/kvserver/kvserverpb", "//pkg/kv/kvserver/loqrecovery/loqrecoverypb", "//pkg/kv/kvserver/stateloader", "//pkg/roachpb:with-mocks", + "//pkg/security", + "//pkg/security/securitytest", + "//pkg/server", "//pkg/storage", "//pkg/storage/enginepb", "//pkg/testutils", + "//pkg/testutils/serverutils", + "//pkg/testutils/testcluster", "//pkg/util/hlc", "//pkg/util/keysutil", "//pkg/util/leaktest", + "//pkg/util/log", + "//pkg/util/protoutil", + "//pkg/util/randutil", "//pkg/util/timeutil", "//pkg/util/uuid", "@com_github_cockroachdb_datadriven//:datadriven", diff --git a/pkg/kv/kvserver/loqrecovery/collect.go b/pkg/kv/kvserver/loqrecovery/collect.go index 4b802f671ec7..efe22095bb8e 100644 --- a/pkg/kv/kvserver/loqrecovery/collect.go +++ b/pkg/kv/kvserver/loqrecovery/collect.go @@ -12,13 +12,19 @@ package loqrecovery import ( "context" + "math" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/errors" + "go.etcd.io/etcd/raft/v3/raftpb" ) // CollectReplicaInfo captures states of all replicas in all stores for the sake of quorum recovery. @@ -45,14 +51,24 @@ func CollectReplicaInfo( if err != nil { return err } + // Check raft log for un-applied range descriptor changes. We start from + // applied+1 (inclusive) and read until the end of the log. We also look + // at potentially uncommitted entries as we have no way to determine their + // outcome, and they will become committed as soon as the replica is + // designated as a survivor. 
+ rangeUpdates, err := GetDescriptorChangesFromRaftLog(desc.RangeID, + rstate.RaftAppliedIndex+1, math.MaxInt64, reader) + if err != nil { + return err + } + replicaData := loqrecoverypb.ReplicaInfo{ - StoreID: storeIdent.StoreID, - NodeID: storeIdent.NodeID, - Desc: desc, - RaftAppliedIndex: rstate.RaftAppliedIndex, - RaftCommittedIndex: hstate.Commit, - // TODO(oleg): #73282 Track presence of uncommitted descriptors - HasUncommittedDescriptors: false, + StoreID: storeIdent.StoreID, + NodeID: storeIdent.NodeID, + Desc: desc, + RaftAppliedIndex: rstate.RaftAppliedIndex, + RaftCommittedIndex: hstate.Commit, + RaftLogDescriptorChanges: rangeUpdates, } replicas = append(replicas, replicaData) return nil @@ -62,3 +78,109 @@ func CollectReplicaInfo( } return loqrecoverypb.NodeReplicaInfo{Replicas: replicas}, nil } + +// GetDescriptorChangesFromRaftLog iterates over raft log between indicies +// lo (inclusive) and hi (exclusive) and searches for changes to range +// descriptor. Changes are identified by commit trigger content which is +// extracted either from EntryNormal where change updates key range info +// (split/merge) or from EntryConfChange* for changes in replica set. +func GetDescriptorChangesFromRaftLog( + rangeID roachpb.RangeID, lo, hi uint64, reader storage.Reader, +) ([]loqrecoverypb.DescriptorChangeInfo, error) { + var changes []loqrecoverypb.DescriptorChangeInfo + + key := keys.RaftLogKey(rangeID, lo) + endKey := keys.RaftLogKey(rangeID, hi) + iter := reader.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ + UpperBound: endKey, + }) + defer iter.Close() + + var meta enginepb.MVCCMetadata + var ent raftpb.Entry + + decodeRaftChange := func(ccI raftpb.ConfChangeI) ([]byte, error) { + var ccC kvserver.ConfChangeContext + if err := protoutil.Unmarshal(ccI.AsV2().Context, &ccC); err != nil { + return nil, errors.Wrap(err, "while unmarshaling CCContext") + } + return ccC.Payload, nil + } + + iter.SeekGE(storage.MakeMVCCMetadataKey(key)) + for ; ; iter.Next() { + ok, err := iter.Valid() + if err != nil { + return nil, err + } + if !ok { + return changes, nil + } + if err := protoutil.Unmarshal(iter.UnsafeValue(), &meta); err != nil { + return nil, errors.Wrap(err, "unable to decode raft log MVCCMetadata") + } + if err := storage.MakeValue(meta).GetProto(&ent); err != nil { + return nil, errors.Wrap(err, "unable to unmarshal raft Entry") + } + if len(ent.Data) == 0 { + continue + } + // Following code extracts our raft command from raft log entry. Depending + // on entry type we either need to extract encoded command from configuration + // change (for replica changes) or from normal command (for splits and + // merges). 
+ var payload []byte + switch ent.Type { + case raftpb.EntryConfChange: + var cc raftpb.ConfChange + if err := protoutil.Unmarshal(ent.Data, &cc); err != nil { + return nil, errors.Wrap(err, "while unmarshaling ConfChange") + } + payload, err = decodeRaftChange(cc) + if err != nil { + return nil, err + } + case raftpb.EntryConfChangeV2: + var cc raftpb.ConfChangeV2 + if err := protoutil.Unmarshal(ent.Data, &cc); err != nil { + return nil, errors.Wrap(err, "while unmarshaling ConfChangeV2") + } + payload, err = decodeRaftChange(cc) + if err != nil { + return nil, err + } + case raftpb.EntryNormal: + _, payload = kvserver.DecodeRaftCommand(ent.Data) + default: + continue + } + if len(payload) == 0 { + continue + } + var raftCmd kvserverpb.RaftCommand + if err := protoutil.Unmarshal(payload, &raftCmd); err != nil { + return nil, errors.Wrap(err, "unable to unmarshal raft command") + } + switch { + case raftCmd.ReplicatedEvalResult.Split != nil: + changes = append(changes, + loqrecoverypb.DescriptorChangeInfo{ + ChangeType: loqrecoverypb.DescriptorChangeType_Split, + Desc: &raftCmd.ReplicatedEvalResult.Split.LeftDesc, + OtherDesc: &raftCmd.ReplicatedEvalResult.Split.RightDesc, + }) + case raftCmd.ReplicatedEvalResult.Merge != nil: + changes = append(changes, + loqrecoverypb.DescriptorChangeInfo{ + ChangeType: loqrecoverypb.DescriptorChangeType_Merge, + Desc: &raftCmd.ReplicatedEvalResult.Merge.LeftDesc, + OtherDesc: &raftCmd.ReplicatedEvalResult.Merge.RightDesc, + }) + case raftCmd.ReplicatedEvalResult.ChangeReplicas != nil: + changes = append(changes, loqrecoverypb.DescriptorChangeInfo{ + ChangeType: loqrecoverypb.DescriptorChangeType_ReplicaChange, + Desc: raftCmd.ReplicatedEvalResult.ChangeReplicas.Desc, + }) + } + } +} diff --git a/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go b/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go new file mode 100644 index 000000000000..a4f7bc03c0af --- /dev/null +++ b/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go @@ -0,0 +1,218 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package loqrecovery_test + +import ( + "context" + "math" + "reflect" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" +) + +// TestFindUpdateDescriptor verifies that we can detect changes to range +// descriptor in the raft log. +// To do this we split and merge the range which updates descriptor prior to +// spawning or subsuming RHS. 
+func TestFindUpdateDescriptor(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + + const testNode = 0 + + var testRangeID roachpb.RangeID + var rHS roachpb.RangeDescriptor + var lHSBefore roachpb.RangeDescriptor + var lHSAfter roachpb.RangeDescriptor + checkRaftLog(t, ctx, testNode, + func(ctx context.Context, tc *testcluster.TestCluster) roachpb.RKey { + scratchKey, err := tc.Server(0).ScratchRange() + require.NoError(t, err, "failed to get scratch range") + srk, err := keys.Addr(scratchKey) + require.NoError(t, err, "failed to resolve scratch key") + + rd, err := tc.LookupRange(scratchKey) + testRangeID = rd.RangeID + require.NoError(t, err, "failed to get descriptor for scratch range") + + splitKey := testutils.MakeKey(scratchKey, []byte("z")) + lHSBefore, rHS, err = tc.SplitRange(splitKey) + require.NoError(t, err, "failed to split scratch range") + + lHSAfter, err = tc.Servers[0].MergeRanges(scratchKey) + require.NoError(t, err, "failed to merge scratch range") + + require.NoError(t, + tc.Server(testNode).DB().Put(ctx, testutils.MakeKey(scratchKey, []byte("|first")), + "some data"), + "failed to put test value in LHS") + + return srk + }, + func(t *testing.T, ctx context.Context, reader storage.Reader) { + seq, err := loqrecovery.GetDescriptorChangesFromRaftLog(testRangeID, 0, math.MaxInt64, reader) + require.NoError(t, err, "failed to read raft log data") + + requireContainsDescriptor(t, loqrecoverypb.DescriptorChangeInfo{ + ChangeType: loqrecoverypb.DescriptorChangeType_Split, + Desc: &lHSBefore, + OtherDesc: &rHS, + }, seq) + requireContainsDescriptor(t, loqrecoverypb.DescriptorChangeInfo{ + ChangeType: loqrecoverypb.DescriptorChangeType_Merge, + Desc: &lHSAfter, + OtherDesc: &rHS, + }, seq) + }) +} + +// TestFindUpdateRaft verifies that we can detect raft change commands in the +// raft log. To do this we change number of replicas and then assert if +// RaftChange updates are found in event sequence. +func TestFindUpdateRaft(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + + const testNode = 0 + + var sRD roachpb.RangeDescriptor + checkRaftLog(t, ctx, testNode, + func(ctx context.Context, tc *testcluster.TestCluster) roachpb.RKey { + scratchKey, err := tc.Server(0).ScratchRange() + require.NoError(t, err, "failed to get scratch range") + srk, err := keys.Addr(scratchKey) + require.NoError(t, err, "failed to resolve scratch key") + rd, err := tc.AddVoters(scratchKey, tc.Target(1)) + require.NoError(t, err, "failed to upreplicate scratch range") + tc.TransferRangeLeaseOrFatal(t, rd, tc.Target(0)) + tc.RemoveVotersOrFatal(t, scratchKey, tc.Targets(1)...) 
+ + sRD, err = tc.LookupRange(scratchKey) + require.NoError(t, err, "failed to get descriptor after remove replicas") + + require.NoError(t, + tc.Server(testNode).DB().Put(ctx, testutils.MakeKey(scratchKey, []byte("|first")), + "some data"), + "failed to put test value in range") + + return srk + }, + func(t *testing.T, ctx context.Context, reader storage.Reader) { + seq, err := loqrecovery.GetDescriptorChangesFromRaftLog(sRD.RangeID, 0, math.MaxInt64, reader) + require.NoError(t, err, "failed to read raft log data") + requireContainsDescriptor(t, loqrecoverypb.DescriptorChangeInfo{ + ChangeType: loqrecoverypb.DescriptorChangeType_ReplicaChange, + Desc: &sRD, + }, seq) + }) +} + +func checkRaftLog( + t *testing.T, + ctx context.Context, + nodeToMonitor int, + action func(ctx context.Context, tc *testcluster.TestCluster) roachpb.RKey, + assertRaftLog func(*testing.T, context.Context, storage.Reader), +) { + t.Helper() + + makeSnapshot := make(chan storage.Engine, 2) + snapshots := make(chan storage.Reader, 2) + + raftFilter := func(args kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) { + t.Helper() + select { + case store := <-makeSnapshot: + snapshots <- store.NewSnapshot() + default: + } + return 0, nil + } + + testRaftConfig := base.RaftConfig{ + // High enough interval to be longer than test but not overflow duration. + RaftTickInterval: math.MaxInt32, + RaftElectionTimeoutTicks: 1000000, + RaftLogTruncationThreshold: math.MaxInt64, + } + + tc := testcluster.NewTestCluster(t, 2, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{ + Knobs: base.TestingKnobs{ + Store: &kvserver.StoreTestingKnobs{ + DisableGCQueue: true, + }, + }, + StoreSpecs: []base.StoreSpec{{InMemory: true}}, + RaftConfig: testRaftConfig, + Insecure: true, + }, + ReplicationMode: base.ReplicationManual, + ServerArgsPerNode: map[int]base.TestServerArgs{ + nodeToMonitor: { + Knobs: base.TestingKnobs{ + Store: &kvserver.StoreTestingKnobs{ + TestingApplyFilter: raftFilter, + DisableGCQueue: true, + }, + }, + StoreSpecs: []base.StoreSpec{{InMemory: true}}, + RaftConfig: testRaftConfig, + Insecure: true, + }, + }, + }) + + tc.Start(t) + defer tc.Stopper().Stop(ctx) + + skey := action(ctx, tc) + + eng := tc.GetFirstStoreFromServer(t, nodeToMonitor).Engine() + makeSnapshot <- eng + // After the test action is complete raft might be completely caught up with + // its messages, so we will write a value into the range to ensure filter + // fires up at least once after we requested capture. 
+	require.NoError(t,
+		tc.Server(0).DB().Put(ctx, testutils.MakeKey(skey, []byte("second")), "some data"),
+		"failed to put test value")
+	reader := <-snapshots
+	assertRaftLog(t, ctx, reader)
+}
+
+func requireContainsDescriptor(
+	t *testing.T, value loqrecoverypb.DescriptorChangeInfo, seq []loqrecoverypb.DescriptorChangeInfo,
+) {
+	t.Helper()
+	for _, v := range seq {
+		if reflect.DeepEqual(value, v) {
+			return
+		}
+	}
+	t.Fatalf("descriptor change sequence %v doesn't contain %v", seq, value)
+}
diff --git a/pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.proto b/pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.proto
index c8145e494012..fe0427288a3e 100644
--- a/pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.proto
+++ b/pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.proto
@@ -15,6 +15,25 @@ option go_package = "loqrecoverypb";
 import "roachpb/metadata.proto";
 import "gogoproto/gogo.proto";
 
+enum DescriptorChangeType {
+  Split = 0;
+  Merge = 1;
+  ReplicaChange = 2;
+}
+
+// DescriptorChangeInfo describes a pending descriptor change extracted from
+// commit triggers in the raft log.
+message DescriptorChangeInfo {
+  // Change type.
+  DescriptorChangeType changeType = 1;
+  // Range descriptor containing the new state for replica change updates, or
+  // the LHS for split and merge operations.
+  roachpb.RangeDescriptor desc = 2;
+  // Optional range descriptor which is populated for split and merge changes
+  // and contains the RHS descriptor for the operation.
+  roachpb.RangeDescriptor otherDesc = 3;
+}
+
 // ReplicaInfo contains info about state of range replica for the purpose of range
 // recovery. This information should be enough for recovery algorithm to pick a
 // survivor replica in when not replicas are available.
@@ -27,7 +46,8 @@ message ReplicaInfo {
   roachpb.RangeDescriptor desc = 3 [(gogoproto.nullable) = false];
   uint64 raft_applied_index = 4;
   uint64 raft_committed_index = 5;
-  bool has_uncommitted_descriptors = 6;
+  repeated DescriptorChangeInfo raft_log_descriptor_changes = 6 [(gogoproto.nullable) = false,
+    (gogoproto.jsontag) = ",omitempty"];
 }
 
 // Collection of replica information gathered from a collect-info run on a single node.
diff --git a/pkg/kv/kvserver/loqrecovery/main_test.go b/pkg/kv/kvserver/loqrecovery/main_test.go
new file mode 100644
index 000000000000..cae490f591d0
--- /dev/null
+++ b/pkg/kv/kvserver/loqrecovery/main_test.go
@@ -0,0 +1,31 @@
+// Copyright 2021 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package loqrecovery_test
+
+import (
+	"os"
+	"testing"
+
+	"github.com/cockroachdb/cockroach/pkg/security"
+	"github.com/cockroachdb/cockroach/pkg/security/securitytest"
+	"github.com/cockroachdb/cockroach/pkg/server"
+	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
+	"github.com/cockroachdb/cockroach/pkg/util/randutil"
+)
+
+func TestMain(m *testing.M) {
+	security.SetAssetLoader(securitytest.EmbeddedAssets)
+	randutil.SeedForTests()
+	serverutils.InitTestServerFactory(server.TestServerFactory)
+	serverutils.InitTestClusterFactory(testcluster.TestClusterFactory)
+	os.Exit(m.Run())
+}
diff --git a/pkg/kv/kvserver/loqrecovery/plan.go b/pkg/kv/kvserver/loqrecovery/plan.go
index 45f8c86e6828..3a753ca7359c 100644
--- a/pkg/kv/kvserver/loqrecovery/plan.go
+++ b/pkg/kv/kvserver/loqrecovery/plan.go
@@ -77,6 +77,19 @@ type PlanningReport struct {
 	// UpdatedNodes contains information about nodes with their stores where plan
 	// needs to be applied. Stores are sorted in ascending order.
 	UpdatedNodes map[roachpb.NodeID][]roachpb.StoreID
+
+	// Problems contains any keyspace coverage or descriptor problems found during planning.
+	Problems []Problem
+}
+
+// Error returns an error if there are problems with the cluster that could make
+// recovery "unsafe". Such errors can be ignored in dire situations, producing a
+// cluster that is partially unblocked but may contain inconsistencies.
+func (p PlanningReport) Error() error {
+	if len(p.Problems) > 0 {
+		return &RecoveryError{p.Problems}
+	}
+	return nil
 }
 
 // ReplicaUpdateReport contains detailed info about changes planned for
@@ -100,6 +113,11 @@ type ReplicaUpdateReport struct {
 // lost.
 // Devised plan doesn't guarantee data consistency after the recovery, only
 // the fact that ranges could progress and subsequently perform up-replication.
+// Moreover, if conflicts are discovered in the range coverage or in the range
+// descriptors, they are returned in the report, but they do not prevent an
+// "unsafe" recovery plan from being created.
+// An error is returned only for unrecoverable problems in the collected data
+// that prevent creation of any sane plan, or for correctable user errors.
 func PlanReplicas(
 	ctx context.Context, nodes []loqrecoverypb.NodeReplicaInfo, deadStores []roachpb.StoreID,
 ) (loqrecoverypb.ReplicaUpdatePlan, PlanningReport, error) {
@@ -123,7 +141,8 @@ func PlanReplicas(
 	for _, rangeReplicas := range replicasByRangeID {
 		proposedSurvivors = append(proposedSurvivors, rankReplicasBySurvivability(rangeReplicas))
 	}
-	if err = checkKeyspaceCovering(proposedSurvivors); err != nil {
+	problems, err := checkKeyspaceCovering(proposedSurvivors)
+	if err != nil {
 		return loqrecoverypb.ReplicaUpdatePlan{}, PlanningReport{}, err
 	}
 
@@ -132,6 +151,7 @@ func PlanReplicas(
 		report.TotalReplicas += len(p)
 		u, ok := makeReplicaUpdateIfNeeded(ctx, p, availableStoreIDs)
 		if ok {
+			problems = append(problems, checkDescriptor(p)...)
plan = append(plan, u) report.DiscardedNonSurvivors += len(p) - 1 report.PlannedUpdates = append(report.PlannedUpdates, makeReplicaUpdateReport(ctx, p, u)) @@ -141,6 +161,11 @@ func PlanReplicas( log.Infof(ctx, "range r%d didn't lose quorum", p.rangeID()) } } + + sort.Slice(problems, func(i, j int) bool { + return problems[i].Span().Key.Compare(problems[j].Span().Key) < 0 + }) + report.Problems = problems report.UpdatedNodes = updatedLocations.asMapOfSlices() return loqrecoverypb.ReplicaUpdatePlan{Updates: plan}, report, nil } @@ -307,7 +332,7 @@ func rankReplicasBySurvivability(replicas []loqrecoverypb.ReplicaInfo) rankedRep // checkKeyspaceCovering given slice of all survivor ranges, checks that full // keyspace is covered. // Note that slice would be sorted in process of the check. -func checkKeyspaceCovering(replicas []rankedReplicas) error { +func checkKeyspaceCovering(replicas []rankedReplicas) ([]Problem, error) { sort.Slice(replicas, func(i, j int) bool { // We only need to sort replicas in key order to detect // key collisions or gaps, but if we have matching keys @@ -323,11 +348,11 @@ func checkKeyspaceCovering(replicas []rankedReplicas) error { } return false }) - var anomalies []keyspaceCoverageAnomaly + var problems []Problem prevDesc := rankedReplicas{{Desc: roachpb.RangeDescriptor{}}} // We validate that first range starts at min key, last range ends at max key // and that for every range start key is equal to end key of previous range. - // If any of those conditions fail, we record this as anomaly to indicate + // If any of those conditions fail, we record this as a problem to indicate // there's a gap between ranges or an overlap between two or more ranges. for _, rankedDescriptors := range replicas { // We need to take special care of the case where the survivor replica is @@ -337,7 +362,7 @@ func checkKeyspaceCovering(replicas []rankedReplicas) error { // then that would be a gap in keyspace coverage. 
r, err := rankedDescriptors.survivor().Replica() if err != nil { - return err + return nil, err } if !r.IsVoterNewConfig() { continue @@ -346,21 +371,19 @@ func checkKeyspaceCovering(replicas []rankedReplicas) error { case rankedDescriptors.startKey().Less(prevDesc.endKey()): start := keyMax(rankedDescriptors.startKey(), prevDesc.startKey()) end := keyMin(rankedDescriptors.endKey(), prevDesc.endKey()) - anomalies = append(anomalies, keyspaceCoverageAnomaly{ + problems = append(problems, keyspaceOverlap{ span: roachpb.Span{Key: roachpb.Key(start), EndKey: roachpb.Key(end)}, - overlap: true, range1: prevDesc.rangeID(), range1Span: prevDesc.span(), range2: rankedDescriptors.rangeID(), range2Span: rankedDescriptors.span(), }) case prevDesc.endKey().Less(rankedDescriptors.startKey()): - anomalies = append(anomalies, keyspaceCoverageAnomaly{ + problems = append(problems, keyspaceGap{ span: roachpb.Span{ Key: roachpb.Key(prevDesc.endKey()), EndKey: roachpb.Key(rankedDescriptors.startKey()), }, - overlap: false, range1: prevDesc.rangeID(), range1Span: prevDesc.span(), range2: rankedDescriptors.rangeID(), @@ -376,9 +399,8 @@ func checkKeyspaceCovering(replicas []rankedReplicas) error { } } if !prevDesc.endKey().Equal(roachpb.RKeyMax) { - anomalies = append(anomalies, keyspaceCoverageAnomaly{ + problems = append(problems, keyspaceGap{ span: roachpb.Span{Key: roachpb.Key(prevDesc.endKey()), EndKey: roachpb.KeyMax}, - overlap: false, range1: prevDesc.rangeID(), range1Span: prevDesc.span(), range2: roachpb.RangeID(0), @@ -386,10 +408,7 @@ func checkKeyspaceCovering(replicas []rankedReplicas) error { }) } - if len(anomalies) > 0 { - return &KeyspaceCoverageError{anomalies: anomalies} - } - return nil + return problems, nil } // makeReplicaUpdateIfNeeded if candidate range can't make progress, create an @@ -449,6 +468,51 @@ func makeReplicaUpdateIfNeeded( }, true } +// checkDescriptor analyses descriptor and raft log of surviving replica to find +// if its state is safe to perform recovery from. Currently only unapplied +// descriptor changes that either remove replica, or change KeySpan (splits or +// merges) are treated as unsafe. +func checkDescriptor(rankedDescriptors rankedReplicas) (problems []Problem) { + // We now need to analyze if range is unsafe to recover due to pending + // changes for the range descriptor in the raft log, but we only want to + // do that if range needs to be recovered. + for _, change := range rankedDescriptors.survivor().RaftLogDescriptorChanges { + switch change.ChangeType { + case loqrecoverypb.DescriptorChangeType_Split: + problems = append(problems, rangeSplit{ + rangeID: rankedDescriptors.rangeID(), + span: rankedDescriptors.span(), + rHSRangeID: change.OtherDesc.RangeID, + rHSRangeSpan: roachpb.Span{ + Key: roachpb.Key(change.OtherDesc.StartKey), + EndKey: roachpb.Key(change.OtherDesc.EndKey), + }, + }) + case loqrecoverypb.DescriptorChangeType_Merge: + problems = append(problems, rangeMerge{ + rangeID: rankedDescriptors.rangeID(), + span: rankedDescriptors.span(), + rHSRangeID: change.OtherDesc.RangeID, + rHSRangeSpan: roachpb.Span{ + Key: roachpb.Key(change.OtherDesc.StartKey), + EndKey: roachpb.Key(change.OtherDesc.EndKey), + }, + }) + case loqrecoverypb.DescriptorChangeType_ReplicaChange: + // Check if our own replica is being removed as part of descriptor + // change. 
+ _, ok := change.Desc.GetReplicaDescriptor(rankedDescriptors.storeID()) + if !ok { + problems = append(problems, rangeReplicaRemoval{ + rangeID: rankedDescriptors.rangeID(), + span: rankedDescriptors.span(), + }) + } + } + } + return +} + // makeReplicaUpdateReport creates a detailed report of changes that needs to // be performed on range. It uses decision as well as information about all // replicas of range to provide information about what is being discarded and diff --git a/pkg/kv/kvserver/loqrecovery/recovery_env_test.go b/pkg/kv/kvserver/loqrecovery/recovery_env_test.go index fe97a0b65c2b..c2f6501246f6 100644 --- a/pkg/kv/kvserver/loqrecovery/recovery_env_test.go +++ b/pkg/kv/kvserver/loqrecovery/recovery_env_test.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" @@ -29,6 +30,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/keysutil" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/datadriven" @@ -52,13 +54,28 @@ type testReplicaInfo struct { Generation roachpb.RangeGeneration `yaml:"Generation,omitempty"` // Raft state. - RangeAppliedIndex uint64 `yaml:"RangeAppliedIndex"` - RaftCommittedIndex uint64 `yaml:"RaftCommittedIndex"` - HasUncommittedDescriptors bool `yaml:"HasUncommittedDescriptors"` + RangeAppliedIndex uint64 `yaml:"RangeAppliedIndex"` + RaftCommittedIndex uint64 `yaml:"RaftCommittedIndex"` + DescriptorUpdates []testReplicaDescriptorChange `yaml:"DescriptorUpdates,flow,omitempty"` // TODO(oleg): Add ability to have descriptor intents in the store for testing purposes } +// Raft log descriptor changes. +type testReplicaDescriptorChange struct { + // Change type determines which fields has to be set here. While this could be + // error-prone, we use this for the sake of test spec brevity. Otherwise + // we will need nested structures which would introduce clutter. + ChangeType loqrecoverypb.DescriptorChangeType `yaml:"Type"` + // Replicas are used to define descriptor change (Type = 2). + Replicas []replicaDescriptorView `yaml:"Replicas,flow,omitempty"` + // RangeID, StartKey and EndKey define right-hand side of split and merge + // operations (Type = 0 or Type = 1). + RangeID roachpb.RangeID `yaml:"RangeID,omitempty"` + StartKey string `yaml:"StartKey,omitempty"` + EndKey string `yaml:"EndKey,omitempty"` +} + type storeView struct { NodeID roachpb.NodeID `yaml:"NodeID"` StoreID roachpb.StoreID `yaml:"StoreID"` @@ -93,6 +110,15 @@ type replicaDescriptorView struct { ReplicaType *roachpb.ReplicaType `yaml:"ReplicaType,omitempty"` } +func (r replicaDescriptorView) asReplicaDescriptor() roachpb.ReplicaDescriptor { + return roachpb.ReplicaDescriptor{ + NodeID: r.NodeID, + StoreID: r.StoreID, + ReplicaID: r.ReplicaID, + Type: r.ReplicaType, + } +} + // Store with its owning NodeID for easier grouping by owning nodes. type wrappedStore struct { engine storage.Engine @@ -139,7 +165,7 @@ func (e *quorumRecoveryEnv) Handle(t *testing.T, d datadriven.TestData) string { // from presentation. 
details := errors.GetAllDetails(err) if len(details) > 0 { - return fmt.Sprintf("ERROR: %s", strings.Join(details, "\n")) + return fmt.Sprintf("ERROR: %s\n%s", err.Error(), strings.Join(details, "\n")) } return fmt.Sprintf("ERROR: %s", err.Error()) } @@ -169,7 +195,7 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes replica.NodeID = roachpb.NodeID(replica.StoreID) } - key, desc, replicaState, hardState := buildReplicaDescriptorFromTestData(t, replica) + key, desc, replicaState, hardState, raftLog := buildReplicaDescriptorFromTestData(t, replica) eng := e.getOrCreateStore(ctx, t, replica.StoreID, replica.NodeID) if err = storage.MVCCPutProto(ctx, eng, nil, key, clock.Now(), nil, /* txn */ @@ -184,13 +210,29 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes if err := sl.SetHardState(ctx, eng, hardState); err != nil { t.Fatalf("failed to save raft hard state: %v", err) } + for i, entry := range raftLog { + value, err := protoutil.Marshal(&entry) + if err != nil { + t.Fatalf("failed to serialize metadata entry for raft log") + } + if err := eng.PutUnversioned(keys.RaftLogKey(replica.RangeID, + uint64(i)+hardState.Commit+1), value); err != nil { + t.Fatalf("failed to insert raft log entry into store: %s", err) + } + } } return "ok" } func buildReplicaDescriptorFromTestData( t *testing.T, replica testReplicaInfo, -) (roachpb.Key, roachpb.RangeDescriptor, kvserverpb.ReplicaState, raftpb.HardState) { +) ( + roachpb.Key, + roachpb.RangeDescriptor, + kvserverpb.ReplicaState, + raftpb.HardState, + []enginepb.MVCCMetadata, +) { clock := hlc.NewClock(hlc.UnixNano, time.Millisecond*100) startKey := parsePrettyKey(t, replica.StartKey) @@ -202,12 +244,7 @@ func buildReplicaDescriptorFromTestData( if r.ReplicaID > maxReplicaID { maxReplicaID = r.ReplicaID } - replicas = append(replicas, roachpb.ReplicaDescriptor{ - NodeID: r.NodeID, - StoreID: r.StoreID, - ReplicaID: r.ReplicaID, - Type: r.ReplicaType, - }) + replicas = append(replicas, r.asReplicaDescriptor()) } if replica.Generation == 0 { replica.Generation = roachpb.RangeGeneration(maxReplicaID) @@ -250,7 +287,79 @@ func buildReplicaDescriptorFromTestData( Vote: 0, Commit: replica.RaftCommittedIndex, } - return key, desc, replicaState, hardState + var raftLog []enginepb.MVCCMetadata + for i, u := range replica.DescriptorUpdates { + entry := raftLogFromPendingDescriptorUpdate(t, replica, u, desc, uint64(i)) + raftLog = append(raftLog, enginepb.MVCCMetadata{RawBytes: entry.RawBytes}) + } + return key, desc, replicaState, hardState, raftLog +} + +func raftLogFromPendingDescriptorUpdate( + t *testing.T, + replica testReplicaInfo, + update testReplicaDescriptorChange, + desc roachpb.RangeDescriptor, + entryIndex uint64, +) roachpb.Value { + // We mimic EndTxn messages with commit triggers here. We don't construct + // full batches with descriptor updates as we only need data that would be + // used by collect stage. Actual data extraction from raft log is tested + // elsewhere using test cluster. 
+ r := kvserverpb.ReplicatedEvalResult{} + switch update.ChangeType { + case loqrecoverypb.DescriptorChangeType_Split: + r.Split = &kvserverpb.Split{ + SplitTrigger: roachpb.SplitTrigger{ + RightDesc: roachpb.RangeDescriptor{ + RangeID: update.RangeID, + StartKey: roachpb.RKey(update.StartKey), + EndKey: roachpb.RKey(update.EndKey), + }, + }, + } + case loqrecoverypb.DescriptorChangeType_Merge: + r.Merge = &kvserverpb.Merge{ + MergeTrigger: roachpb.MergeTrigger{ + RightDesc: roachpb.RangeDescriptor{ + RangeID: update.RangeID, + StartKey: roachpb.RKey(update.StartKey), + EndKey: roachpb.RKey(update.EndKey), + }, + }, + } + case loqrecoverypb.DescriptorChangeType_ReplicaChange: + var newReplicas []roachpb.ReplicaDescriptor + for _, r := range update.Replicas { + newReplicas = append(newReplicas, r.asReplicaDescriptor()) + } + r.ChangeReplicas = &kvserverpb.ChangeReplicas{ + ChangeReplicasTrigger: roachpb.ChangeReplicasTrigger{ + Desc: &roachpb.RangeDescriptor{ + RangeID: desc.RangeID, + InternalReplicas: newReplicas, + }, + }, + } + } + raftCmd := kvserverpb.RaftCommand{ReplicatedEvalResult: r} + out, err := protoutil.Marshal(&raftCmd) + if err != nil { + t.Fatalf("failed to serialize raftCommand: %v", err) + } + data := kvserver.EncodeTestRaftCommand( + out, kvserverbase.CmdIDKey(fmt.Sprintf("%08d", entryIndex))) + ent := raftpb.Entry{ + Term: 1, + Index: replica.RaftCommittedIndex + entryIndex, + Type: raftpb.EntryNormal, + Data: data, + } + var value roachpb.Value + if err := value.SetProto(&ent); err != nil { + t.Fatalf("can't construct raft entry from test data: %s", err) + } + return value } func parsePrettyKey(t *testing.T, pretty string) roachpb.RKey { @@ -263,12 +372,20 @@ func parsePrettyKey(t *testing.T, pretty string) roachpb.RKey { } func (e *quorumRecoveryEnv) handleMakePlan(t *testing.T, d datadriven.TestData) (string, error) { - var err error stores := e.parseStoresArg(t, d, false /* defaultToAll */) - e.plan, _, err = PlanReplicas(context.Background(), e.replicas, stores) + plan, report, err := PlanReplicas(context.Background(), e.replicas, stores) if err != nil { return "", err } + err = report.Error() + var force bool + if d.HasArg("force") { + d.ScanArgs(t, "force", &force) + } + if err != nil && !force { + return "", err + } + e.plan = plan // We only marshal actual data without container to reduce clutter. out, err := yaml.Marshal(e.plan.Updates) if err != nil { diff --git a/pkg/kv/kvserver/loqrecovery/testdata/force_inconsistent_plans b/pkg/kv/kvserver/loqrecovery/testdata/force_inconsistent_plans new file mode 100644 index 000000000000..dcad4cf737bc --- /dev/null +++ b/pkg/kv/kvserver/loqrecovery/testdata/force_inconsistent_plans @@ -0,0 +1,182 @@ +# Tests verifying that force would allow creation of plans where some of the ranges are violate +# safety criteria like keyspace coverage or presence of descriptor changes in raft log. + +# Check that range with pending descriptor change could be forced to become a survivor. +replication-data +- StoreID: 1 + RangeID: 1 # This range lost quorum and could be recovered, we want to ensure that this recovery + # proceeds as normal along with forced recovery for subsequent range. 
+ StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 3, StoreID: 3, ReplicaID: 2} + - { NodeID: 4, StoreID: 4, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 2 # This is the only surviving replica but it has an unapplied descriptor in raft log + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 14 + DescriptorUpdates: # pending updates to descriptor should raise make plan unsafe to proceed + - Type: 2 + Replicas: + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} +---- +ok + +collect-replica-info stores=(1) +---- +ok + +make-plan +---- +ERROR: loss of quorum recovery error +range has unapplied descriptor change that removes current replica + r2: /{Table/1-Max} + +make-plan force=true +---- +- RangeID: 1 + StartKey: /Min + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 14 + NextReplicaID: 15 +- RangeID: 2 + StartKey: /Table/1 + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 14 + NextReplicaID: 15 + +# Check that with a range gap between two recoverable replicas could be forced to proceed. +# Range 2 is missing but ranges 1 and 3 still need to recover quorum. Forcing plan should +# do partial recovery. +replication-data +- StoreID: 1 + RangeID: 1 + StartKey: /Min + EndKey: /Table/3 # first range ends short of the second one leaving a missing [Table/3, Table/4) + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 3 + StartKey: /Table/4 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 13 +---- +ok + +collect-replica-info stores=(1) +---- +ok + +make-plan +---- +ERROR: loss of quorum recovery error +range gap /Table/{3-4} + r1: /{Min-Table/3} + r3: /{Table/4-Max} + +make-plan force=true +---- +- RangeID: 1 + StartKey: /Min + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 14 + NextReplicaID: 15 +- RangeID: 3 + StartKey: /Table/4 + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 14 + NextReplicaID: 15 + + +# Check that cluster with a range gap between two healthy replicas could be forced to proceed and produce +# an empty recovery plan since there's nothing left to recover. 
+replication-data +- StoreID: 1 + RangeID: 1 + StartKey: /Min + EndKey: /Table/3 # first range ends short of the second one leaving a missing [Table/3, Table/4) + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 3 + StartKey: /Table/4 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 13 +- StoreID: 2 + RangeID: 1 + StartKey: /Min + EndKey: /Table/3 # first range ends short of the second one leaving a missing [Table/3, Table/4) + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 13 +- StoreID: 2 + RangeID: 3 + StartKey: /Table/4 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 13 +---- +ok + +collect-replica-info stores=(1,2) +---- +ok + +make-plan +---- +ERROR: loss of quorum recovery error +range gap /Table/{3-4} + r1: /{Min-Table/3} + r3: /{Table/4-Max} + +make-plan force=true +---- +[] diff --git a/pkg/kv/kvserver/loqrecovery/testdata/keyspace_coverage b/pkg/kv/kvserver/loqrecovery/testdata/keyspace_coverage index b75513b05cda..6db8d56e3d6e 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/keyspace_coverage +++ b/pkg/kv/kvserver/loqrecovery/testdata/keyspace_coverage @@ -43,7 +43,7 @@ ok make-plan ---- -ERROR: Key space covering is not complete. Discovered following inconsistencies: +ERROR: loss of quorum recovery error range overlap /{Min-Table/3} r1: /{Min-Table/3} r3: /{Min-Table/3} @@ -80,7 +80,7 @@ ok make-plan ---- -ERROR: Key space covering is not complete. Discovered following inconsistencies: +ERROR: loss of quorum recovery error range gap /Table/{3-4} r1: /{Min-Table/3} r3: /{Table/4-Max} @@ -136,7 +136,7 @@ ok make-plan ---- -ERROR: Key space covering is not complete. Discovered following inconsistencies: +ERROR: loss of quorum recovery error range overlap /Table/{1-3} r1: /{Min-Table/10} r3: /Table/{1-3} @@ -167,7 +167,7 @@ ok make-plan ---- -ERROR: Key space covering is not complete. Discovered following inconsistencies: +ERROR: loss of quorum recovery error range gap /{Min-Table/1} r0: /Min r1: /Table/{1-99} @@ -187,8 +187,7 @@ ok make-plan ---- -ERROR: Key space covering is not complete. Discovered following inconsistencies: +ERROR: loss of quorum recovery error range gap /M{in-ax} r0: /Min r0: /Max{-} - diff --git a/pkg/kv/kvserver/loqrecovery/testdata/learners_lose b/pkg/kv/kvserver/loqrecovery/testdata/learners_lose index 1c35ab5089f7..58ca31c21ca5 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/learners_lose +++ b/pkg/kv/kvserver/loqrecovery/testdata/learners_lose @@ -115,7 +115,7 @@ ok make-plan ---- -ERROR: Key space covering is not complete. 
Discovered following inconsistencies: +ERROR: loss of quorum recovery error range gap /{Min-Table/1} r0: /Min r2: /{Table/1-Max} diff --git a/pkg/kv/kvserver/loqrecovery/testdata/pending_descriptor_changes b/pkg/kv/kvserver/loqrecovery/testdata/pending_descriptor_changes new file mode 100644 index 000000000000..e7c414d945a5 --- /dev/null +++ b/pkg/kv/kvserver/loqrecovery/testdata/pending_descriptor_changes @@ -0,0 +1,279 @@ +# Tests verifying handling of pending descriptor changes from raft log + +# Check that ranges with pending descriptor changes where the change removes current replica are +# unsafe to proceed with. +replication-data +- StoreID: 1 + RangeID: 1 + StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 2 + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 3, StoreID: 3, ReplicaID: 2} + - { NodeID: 4, StoreID: 4, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 14 + DescriptorUpdates: # pending replica updates in the descriptor would make it unsafe to proceed + - Type: 2 + Replicas: + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} +- StoreID: 2 + RangeID: 1 + StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 10 +---- +ok + +collect-replica-info stores=(1,2) +---- +ok + +make-plan +---- +ERROR: loss of quorum recovery error +range has unapplied descriptor change that removes current replica + r2: /{Table/1-Max} + + +# Check that ranges with pending descriptor with split or merge are blocking plan creation +replication-data +- StoreID: 1 + RangeID: 1 + StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 2 # replica with uncommitted descriptor updates + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} + - { NodeID: 5, StoreID: 5, ReplicaID: 5} + RangeAppliedIndex: 11 # this is preferred replica with higher applied index + RaftCommittedIndex: 14 + DescriptorUpdates: + - { Type: 0, RangeID: 3, StartKey: /Table/2, EndKey: /Max} + - { Type: 1, RangeID: 3, StartKey: /Table/2, EndKey: /Max} +- StoreID: 2 + RangeID: 2 + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} + - { NodeID: 5, StoreID: 5, ReplicaID: 5} + RangeAppliedIndex: 10 # applied index takes precedence over store ID so this replica loses + RaftCommittedIndex: 10 # committed index while higher, should not confuse planner and use applied index +---- +ok + +collect-replica-info stores=(1,2) +---- +ok + +make-plan +---- +ERROR: loss of quorum recovery error +range has unapplied split operation + r2, /{Table/1-Max} rhs r3, /{Table/2-Max} +range has unapplied merge operation + r2, /{Table/1-Max} with 
r3, /{Table/2-Max} + + +# Check that ranges with pending descriptor changes where the change removes other replicas are +# considered safe to proceed with. +replication-data +- StoreID: 1 + RangeID: 1 + StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 2 + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 14 + DescriptorUpdates: + - Type: 2 # pending descriptor update where replicas 2 3 are replaced with 4 which is considered safe + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} +---- +ok + +collect-replica-info stores=(1) +---- +ok + +make-plan +---- +- RangeID: 1 + StartKey: /Min + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 14 + NextReplicaID: 15 +- RangeID: 2 + StartKey: /Table/1 + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 14 + NextReplicaID: 15 + + +# Check that if descriptor didn't lose quorum, we should not fail if raft log contains future changes +# that we think is unsafe. We should leave descriptor as-it. +replication-data +- StoreID: 1 + RangeID: 1 # healthy range to maintain keyspace coverage + StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 13 +- StoreID: 1 + RangeID: 2 # range which still has quorum, but unsafe changes for descriptor in raft log + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 14 + DescriptorUpdates: + - Type: 2 # pending descriptor update where replicas 1 is replaced with 5 which is unsafe + Replicas: + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 5, StoreID: 5, ReplicaID: 5} +- StoreID: 2 + RangeID: 1 # healthy range to maintain keyspace coverage + StartKey: /Min + EndKey: /Table/1 + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 11 + RaftCommittedIndex: 13 +- StoreID: 2 + RangeID: 2 + StartKey: /Table/1 + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + RangeAppliedIndex: 10 + RaftCommittedIndex: 10 # this descriptor doesn't know the changes yet, but it maintains quorum +---- +ok + +collect-replica-info stores=(1,2) +---- +ok + +make-plan +---- +[] + + +# Check that if non survivor replica has pending changes, it doesn't prevent recovery from +# proceeding. 
+replication-data +- StoreID: 1 + RangeID: 1 # this range lost quorum and other replica is not up to date with descriptor + # but it shouldn't prevent this descriptor from becoming designated as survivor + StartKey: /Min + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} + - { NodeID: 5, StoreID: 5, ReplicaID: 5} + - { NodeID: 6, StoreID: 6, ReplicaID: 6} + RangeAppliedIndex: 15 + RaftCommittedIndex: 15 +- StoreID: 2 + RangeID: 1 + StartKey: /Min + EndKey: /Max + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 2, StoreID: 2, ReplicaID: 2} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} + - { NodeID: 5, StoreID: 5, ReplicaID: 5} + RangeAppliedIndex: 10 + RaftCommittedIndex: 15 # This descriptor didn't apply the changes yet + DescriptorUpdates: + - Type: 2 # Pending descriptor update where replica 2 is replaced with 6 which is unsafe + Replicas: + - { NodeID: 1, StoreID: 1, ReplicaID: 1} + - { NodeID: 3, StoreID: 3, ReplicaID: 3} + - { NodeID: 4, StoreID: 4, ReplicaID: 4} + - { NodeID: 5, StoreID: 5, ReplicaID: 5} + - { NodeID: 6, StoreID: 6, ReplicaID: 6} +---- +ok + +collect-replica-info stores=(1,2) +---- +ok + +make-plan +---- +- RangeID: 1 + StartKey: /Min + OldReplicaID: 1 + NewReplica: + NodeID: 1 + StoreID: 1 + ReplicaID: 17 + NextReplicaID: 18 diff --git a/pkg/kv/kvserver/loqrecovery/utils.go b/pkg/kv/kvserver/loqrecovery/utils.go index ac9ca119b245..07342b7b1ca8 100644 --- a/pkg/kv/kvserver/loqrecovery/utils.go +++ b/pkg/kv/kvserver/loqrecovery/utils.go @@ -55,14 +55,24 @@ func keyMin(key1 roachpb.RKey, key2 roachpb.RKey) roachpb.RKey { return key1 } -// keyspaceCoverageAnomaly records errors found when checking keyspace coverage. -// Anomaly is a key span where there's no coverage or there are more than one -// range that covers the span. -// Anomaly also contains additional information about ranges that either -// bordering the gap or overlap over the anomaly span. -type keyspaceCoverageAnomaly struct { - span roachpb.Span - overlap bool +// Problem records errors found when checking keyspace coverage and +// health of survivor replicas. Problem covers a key span that is either not +// covered by any ranges, covered multiple times or correspond to replicas data +// in which could not act as a source of truth in absence of other replicas. +// Main goal of this interface to provide a human-readable representations of +// problem discovered during planning process. +// Problems contain span so that they could be ordered for user presentations. +// Problem also contains additional information about ranges that either +// bordering the gap or overlap over the problematic span. +type Problem interface { + fmt.Stringer + // Span returns span for detected problem. Problems should report consistent + // span for the sake of ordered data presentation. 
+ Span() roachpb.Span +} + +type keyspaceGap struct { + span roachpb.Span range1 roachpb.RangeID range1Span roachpb.Span @@ -71,34 +81,94 @@ type keyspaceCoverageAnomaly struct { range2Span roachpb.Span } -func (i keyspaceCoverageAnomaly) String() string { - if i.overlap { - return fmt.Sprintf("range overlap %v\n r%d: %v\n r%d: %v", - i.span, i.range1, i.range1Span, i.range2, i.range2Span) - } +func (i keyspaceGap) String() string { return fmt.Sprintf("range gap %v\n r%d: %v\n r%d: %v", i.span, i.range1, i.range1Span, i.range2, i.range2Span) } -// KeyspaceCoverageError is returned by replica planner when it detects problems -// with key coverage. Error contains all anomalies found. It also provides a -// convenience function to print report. -type KeyspaceCoverageError struct { - anomalies []keyspaceCoverageAnomaly +func (i keyspaceGap) Span() roachpb.Span { + return i.span +} + +type keyspaceOverlap struct { + span roachpb.Span + + range1 roachpb.RangeID + range1Span roachpb.Span + + range2 roachpb.RangeID + range2Span roachpb.Span +} + +func (i keyspaceOverlap) String() string { + return fmt.Sprintf("range overlap %v\n r%d: %v\n r%d: %v", + i.span, i.range1, i.range1Span, i.range2, i.range2Span) +} + +func (i keyspaceOverlap) Span() roachpb.Span { + return i.span +} + +type rangeSplit struct { + rangeID roachpb.RangeID + span roachpb.Span + + rHSRangeID roachpb.RangeID + rHSRangeSpan roachpb.Span +} + +func (i rangeSplit) String() string { + return fmt.Sprintf("range has unapplied split operation\n r%d, %v rhs r%d, %v", + i.rangeID, i.span, i.rHSRangeID, i.rHSRangeSpan) +} + +func (i rangeSplit) Span() roachpb.Span { + return i.span +} + +type rangeMerge rangeSplit + +func (i rangeMerge) String() string { + return fmt.Sprintf("range has unapplied merge operation\n r%d, %v with r%d, %v", + i.rangeID, i.span, i.rHSRangeID, i.rHSRangeSpan) +} + +func (i rangeMerge) Span() roachpb.Span { + return i.span +} + +type rangeReplicaRemoval struct { + rangeID roachpb.RangeID + span roachpb.Span +} + +func (i rangeReplicaRemoval) String() string { + return fmt.Sprintf("range has unapplied descriptor change that removes current replica\n r%d: %v", + i.rangeID, + i.span) +} + +func (i rangeReplicaRemoval) Span() roachpb.Span { + return i.span +} + +// RecoveryError is returned by replica planner when it detects problems +// with replicas in key space. Error contains all problems found in keyspace. +// RecoveryError implements ErrorDetailer to integrate into cli commands. +type RecoveryError struct { + problems []Problem } -func (e *KeyspaceCoverageError) Error() string { - return "keyspace coverage error" +func (e *RecoveryError) Error() string { + return "loss of quorum recovery error" } // ErrorDetail returns a properly formatted report that could be presented // to user. -func (e *KeyspaceCoverageError) ErrorDetail() string { - descriptions := make([]string, 0, len(e.anomalies)) - for _, id := range e.anomalies { +func (e *RecoveryError) ErrorDetail() string { + descriptions := make([]string, 0, len(e.problems)) + for _, id := range e.problems { descriptions = append(descriptions, fmt.Sprintf("%v", id)) } - return fmt.Sprintf( - "Key space covering is not complete. 
Discovered following inconsistencies:\n%s\n",
-		strings.Join(descriptions, "\n"))
+	return strings.Join(descriptions, "\n")
 }
diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go
index 340132cefd1e..0c7b8605ecfc 100644
--- a/pkg/kv/kvserver/replica_raftstorage.go
+++ b/pkg/kv/kvserver/replica_raftstorage.go
@@ -1300,3 +1300,12 @@ func DecodeRaftCommand(data []byte) (kvserverbase.CmdIDKey, []byte) {
 	}
 	return kvserverbase.CmdIDKey(data[1 : 1+raftCommandIDLen]), data[1+raftCommandIDLen:]
 }
+
+// EncodeTestRaftCommand encodes the provided command as a raft command by adding
+// the appropriate prefix to the data. It is only used in tests to create raft
+// log entries. No correctness guarantees are provided.
+//
+// See: #75729
+func EncodeTestRaftCommand(command []byte, commandID kvserverbase.CmdIDKey) []byte {
+	return encodeRaftCommand(raftVersionStandard, commandID, command)
+}