From 00e7fdf4719a92d12f208e64e90f96ce54d600d0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 25 Apr 2022 12:17:07 +0000 Subject: [PATCH] kvserver/loqrecovery: persist new replica ID in `RaftReplicaID` The recently introduced local `RaftReplicaIDKey` was not updated when loss of quorum recovery changed the replica's ID. This could lead to assertion failures. Release note: None --- pkg/kv/kvserver/loqrecovery/apply.go | 5 +++ .../kvserver/loqrecovery/recovery_env_test.go | 32 +++++++++++++++++-- .../loqrecovery/testdata/learners_lose | 6 ++++ .../testdata/max_applied_voter_wins | 24 ++++++++++++++ .../loqrecovery/testdata/max_store_voter_wins | 6 ++++ .../testdata/no_change_when_no_dead_peers | 9 ++++++ .../testdata/no_change_when_quorum | 6 ++++ 7 files changed, 86 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/loqrecovery/apply.go b/pkg/kv/kvserver/loqrecovery/apply.go index 5b6eecfa7abd..b5eb2310ebe5 100644 --- a/pkg/kv/kvserver/loqrecovery/apply.go +++ b/pkg/kv/kvserver/loqrecovery/apply.go @@ -293,6 +293,11 @@ func applyReplicaUpdate( report.OldReplica, _ = report.RemovedReplicas.RemoveReplica( update.NewReplica.NodeID, update.NewReplica.StoreID) + // Persist the new replica ID. + if err := sl.SetRaftReplicaID(ctx, readWriter, update.NewReplica.ReplicaID); err != nil { + return PrepareReplicaReport{}, errors.Wrap(err, "setting new replica ID") + } + // Refresh stats if err := sl.SetMVCCStats(ctx, readWriter, &ms); err != nil { return PrepareReplicaReport{}, errors.Wrap(err, "updating MVCCStats") diff --git a/pkg/kv/kvserver/loqrecovery/recovery_env_test.go b/pkg/kv/kvserver/loqrecovery/recovery_env_test.go index ecb99fb8b3ae..06f42716a8fc 100644 --- a/pkg/kv/kvserver/loqrecovery/recovery_env_test.go +++ b/pkg/kv/kvserver/loqrecovery/recovery_env_test.go @@ -81,6 +81,7 @@ type storeView struct { StoreID roachpb.StoreID `yaml:"StoreID"` Descriptors []storeDescriptorView `yaml:"Descriptors"` + LocalData []localDataView `yaml:"LocalData"` } // storeDescriptorView contains important fields from the range @@ -119,6 +120,12 @@ func (r replicaDescriptorView) asReplicaDescriptor() roachpb.ReplicaDescriptor { } } +// localDataView contains interesting local store data for each range. +type localDataView struct { + RangeID roachpb.RangeID `yaml:"RangeID"` + RaftReplicaID int `yaml:"RaftReplicaID"` +} + // Store with its owning NodeID for easier grouping by owning nodes. type wrappedStore struct { engine storage.Engine @@ -195,7 +202,8 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes replica.NodeID = roachpb.NodeID(replica.StoreID) } - key, desc, replicaState, hardState, raftLog := buildReplicaDescriptorFromTestData(t, replica) + replicaID, key, desc, replicaState, hardState, raftLog := + buildReplicaDescriptorFromTestData(t, replica) eng := e.getOrCreateStore(ctx, t, replica.StoreID, replica.NodeID) if err = storage.MVCCPutProto(ctx, eng, nil, key, clock.Now(), nil, /* txn */ @@ -210,6 +218,9 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes if err := sl.SetHardState(ctx, eng, hardState); err != nil { t.Fatalf("failed to save raft hard state: %v", err) } + if err := sl.SetRaftReplicaID(ctx, eng, replicaID); err != nil { + t.Fatalf("failed to set raft replica ID: %v", err) + } for i, entry := range raftLog { value, err := protoutil.Marshal(&entry) if err != nil { @@ -227,6 +238,7 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes func buildReplicaDescriptorFromTestData( t *testing.T, replica testReplicaInfo, ) ( + roachpb.ReplicaID, roachpb.Key, roachpb.RangeDescriptor, kvserverpb.ReplicaState, @@ -239,12 +251,16 @@ func buildReplicaDescriptorFromTestData( endKey := parsePrettyKey(t, replica.EndKey) key := keys.RangeDescriptorKey(startKey) var replicas []roachpb.ReplicaDescriptor + var replicaID roachpb.ReplicaID maxReplicaID := replica.Replicas[0].ReplicaID for _, r := range replica.Replicas { if r.ReplicaID > maxReplicaID { maxReplicaID = r.ReplicaID } replicas = append(replicas, r.asReplicaDescriptor()) + if r.NodeID == replica.NodeID && r.StoreID == replica.StoreID { + replicaID = r.ReplicaID + } } if replica.Generation == 0 { replica.Generation = roachpb.RangeGeneration(maxReplicaID) @@ -292,7 +308,7 @@ func buildReplicaDescriptorFromTestData( entry := raftLogFromPendingDescriptorUpdate(t, replica, u, desc, uint64(i)) raftLog = append(raftLog, enginepb.MVCCMetadata{RawBytes: entry.RawBytes}) } - return key, desc, replicaState, hardState, raftLog + return replicaID, key, desc, replicaState, hardState, raftLog } func raftLogFromPendingDescriptorUpdate( @@ -520,10 +536,21 @@ func (e *quorumRecoveryEnv) handleDumpStore(t *testing.T, d datadriven.TestData) var storesView []storeView for _, storeID := range stores { var descriptorViews []storeDescriptorView + var localDataViews []localDataView store := e.stores[storeID] err := kvserver.IterateRangeDescriptorsFromDisk(ctx, store.engine, func(desc roachpb.RangeDescriptor) error { descriptorViews = append(descriptorViews, descriptorView(desc)) + + sl := stateloader.Make(desc.RangeID) + raftReplicaID, _, err := sl.LoadRaftReplicaID(ctx, store.engine) + if err != nil { + t.Fatalf("failed to load Raft replica ID: %v", err) + } + localDataViews = append(localDataViews, localDataView{ + RangeID: desc.RangeID, + RaftReplicaID: int(raftReplicaID.ReplicaID), + }) return nil }) if err != nil { @@ -533,6 +560,7 @@ func (e *quorumRecoveryEnv) handleDumpStore(t *testing.T, d datadriven.TestData) NodeID: e.stores[storeID].nodeID, StoreID: storeID, Descriptors: descriptorViews, + LocalData: localDataViews, }) } out, err := yaml.Marshal(storesView) diff --git a/pkg/kv/kvserver/loqrecovery/testdata/learners_lose b/pkg/kv/kvserver/loqrecovery/testdata/learners_lose index 58ca31c21ca5..5379afc6c88a 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/learners_lose +++ b/pkg/kv/kvserver/loqrecovery/testdata/learners_lose @@ -61,6 +61,9 @@ dump-store stores=(1,2) StartKey: /Min Replicas: - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 16} + LocalData: + - RangeID: 1 + RaftReplicaID: 16 - NodeID: 2 StoreID: 2 Descriptors: @@ -72,6 +75,9 @@ dump-store stores=(1,2) - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} - Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4} - Replica: {NodeID: 5, StoreID: 5, ReplicaID: 5} + LocalData: + - RangeID: 1 + RaftReplicaID: 2 # Second use case where we can't make a decision and fail keyspace coverage as # only a single learner is left, there is no way to recover. diff --git a/pkg/kv/kvserver/loqrecovery/testdata/max_applied_voter_wins b/pkg/kv/kvserver/loqrecovery/testdata/max_applied_voter_wins index 7315c7be9b53..bebb99bd2568 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/max_applied_voter_wins +++ b/pkg/kv/kvserver/loqrecovery/testdata/max_applied_voter_wins @@ -59,6 +59,9 @@ dump-store stores=(1,2) StartKey: /Min Replicas: - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 16} + LocalData: + - RangeID: 1 + RaftReplicaID: 16 - NodeID: 2 StoreID: 2 Descriptors: @@ -70,6 +73,9 @@ dump-store stores=(1,2) - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} - Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4} - Replica: {NodeID: 5, StoreID: 5, ReplicaID: 5} + LocalData: + - RangeID: 1 + RaftReplicaID: 2 dump-events stores=(1,2) ---- @@ -184,6 +190,11 @@ dump-store stores=(1,2,5,6) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 5, StoreID: 5, ReplicaID: 6} - Replica: {NodeID: 6, StoreID: 6, ReplicaID: 7} + LocalData: + - RangeID: 1 + RaftReplicaID: 1 + - RangeID: 2 + RaftReplicaID: 1 - NodeID: 2 StoreID: 2 Descriptors: @@ -193,6 +204,9 @@ dump-store stores=(1,2,5,6) - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2} - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} - Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4} + LocalData: + - RangeID: 1 + RaftReplicaID: 2 - NodeID: 5 StoreID: 5 Descriptors: @@ -208,6 +222,11 @@ dump-store stores=(1,2,5,6) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 5, StoreID: 5, ReplicaID: 6} - Replica: {NodeID: 6, StoreID: 6, ReplicaID: 7} + LocalData: + - RangeID: 1 + RaftReplicaID: 6 + - RangeID: 2 + RaftReplicaID: 6 - NodeID: 6 StoreID: 6 Descriptors: @@ -223,3 +242,8 @@ dump-store stores=(1,2,5,6) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 5, StoreID: 5, ReplicaID: 6} - Replica: {NodeID: 6, StoreID: 6, ReplicaID: 7} + LocalData: + - RangeID: 1 + RaftReplicaID: 7 + - RangeID: 2 + RaftReplicaID: 7 diff --git a/pkg/kv/kvserver/loqrecovery/testdata/max_store_voter_wins b/pkg/kv/kvserver/loqrecovery/testdata/max_store_voter_wins index d3ca49ad2bdd..758dfdcdb0af 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/max_store_voter_wins +++ b/pkg/kv/kvserver/loqrecovery/testdata/max_store_voter_wins @@ -64,6 +64,9 @@ dump-store stores=(1,2) - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} - Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4} - Replica: {NodeID: 5, StoreID: 5, ReplicaID: 5} + LocalData: + - RangeID: 1 + RaftReplicaID: 1 - NodeID: 2 StoreID: 2 Descriptors: @@ -71,3 +74,6 @@ dump-store stores=(1,2) StartKey: /Min Replicas: - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 16} + LocalData: + - RangeID: 1 + RaftReplicaID: 16 diff --git a/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_no_dead_peers b/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_no_dead_peers index b149470f9a40..92df255694cd 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_no_dead_peers +++ b/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_no_dead_peers @@ -58,6 +58,9 @@ dump-store stores=(1,2,3) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2} - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} + LocalData: + - RangeID: 1 + RaftReplicaID: 1 - NodeID: 2 StoreID: 2 Descriptors: @@ -67,6 +70,9 @@ dump-store stores=(1,2,3) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2} - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} + LocalData: + - RangeID: 1 + RaftReplicaID: 2 - NodeID: 3 StoreID: 3 Descriptors: @@ -76,3 +82,6 @@ dump-store stores=(1,2,3) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2} - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} + LocalData: + - RangeID: 1 + RaftReplicaID: 3 diff --git a/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_quorum b/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_quorum index b92249ba6e94..b6c9cf1dea09 100644 --- a/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_quorum +++ b/pkg/kv/kvserver/loqrecovery/testdata/no_change_when_quorum @@ -47,6 +47,9 @@ dump-store stores=(1,2) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2} - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} + LocalData: + - RangeID: 1 + RaftReplicaID: 1 - NodeID: 2 StoreID: 2 Descriptors: @@ -56,3 +59,6 @@ dump-store stores=(1,2) - Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1} - Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2} - Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3} + LocalData: + - RangeID: 1 + RaftReplicaID: 2