Skip to content

Commit

Permalink
Merge #80470
Browse files Browse the repository at this point in the history
80470: kvserver/loqrecovery: persist new replica ID in `RaftReplicaID` r=aliher1911 a=erikgrinaker

I'm not certain that this is the cause of #75133 (wasn't able to reproduce), but it seems plausible. I think it's something that we'd need to fix anyway, but I'm not familiar with all of the nuance here, so I'd appreciate careful reviews.

For reference, this was introduced in #75761.

---

**cli: persist new replica ID in `unsafe-remove-dead-replicas`**

The recently introduced local `RaftReplicaIDKey` was not updated when
`unsafe-remove-dead-replicas` changed the replica's ID. This could lead
to assertion failures.

Touches #75133.
Touches #79074.

Release note: None

**kvserver/loqrecovery: persist new replica ID in `RaftReplicaID`**

The recently introduced local `RaftReplicaIDKey` was not updated when
loss of quorum recovery changed the replica's ID. This could lead to
assertion failures.

Release note: None

Co-authored-by: Erik Grinaker <[email protected]>
  • Loading branch information
craig[bot] and erikgrinaker committed Apr 27, 2022
2 parents 0020ddd + 00e7fdf commit ce57830
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 2 deletions.
9 changes: 9 additions & 0 deletions pkg/cli/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -1371,6 +1371,15 @@ func removeDeadReplicas(
batch.Close()
return nil, err
}
// Write the new replica ID to RaftReplicaIDKey.
replicas := desc.Replicas().Descriptors()
if len(replicas) != 1 {
return nil, errors.Errorf("expected 1 replica, got %v", replicas)
}
if err := sl.SetRaftReplicaID(ctx, batch, replicas[0].ReplicaID); err != nil {
return nil, errors.Wrapf(err, "failed to write new replica ID for range %d", desc.RangeID)
}
// Update MVCC stats.
if err := sl.SetMVCCStats(ctx, batch, &ms); err != nil {
return nil, errors.Wrap(err, "updating MVCCStats")
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/kv/kvserver/loqrecovery/apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,11 @@ func applyReplicaUpdate(
report.OldReplica, _ = report.RemovedReplicas.RemoveReplica(
update.NewReplica.NodeID, update.NewReplica.StoreID)

// Persist the new replica ID.
if err := sl.SetRaftReplicaID(ctx, readWriter, update.NewReplica.ReplicaID); err != nil {
return PrepareReplicaReport{}, errors.Wrap(err, "setting new replica ID")
}

// Refresh stats
if err := sl.SetMVCCStats(ctx, readWriter, &ms); err != nil {
return PrepareReplicaReport{}, errors.Wrap(err, "updating MVCCStats")
Expand Down
32 changes: 30 additions & 2 deletions pkg/kv/kvserver/loqrecovery/recovery_env_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ type storeView struct {
StoreID roachpb.StoreID `yaml:"StoreID"`

Descriptors []storeDescriptorView `yaml:"Descriptors"`
LocalData []localDataView `yaml:"LocalData"`
}

// storeDescriptorView contains important fields from the range
Expand Down Expand Up @@ -119,6 +120,12 @@ func (r replicaDescriptorView) asReplicaDescriptor() roachpb.ReplicaDescriptor {
}
}

// localDataView contains interesting local store data for each range.
type localDataView struct {
RangeID roachpb.RangeID `yaml:"RangeID"`
RaftReplicaID int `yaml:"RaftReplicaID"`
}

// Store with its owning NodeID for easier grouping by owning nodes.
type wrappedStore struct {
engine storage.Engine
Expand Down Expand Up @@ -195,7 +202,8 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes
replica.NodeID = roachpb.NodeID(replica.StoreID)
}

key, desc, replicaState, hardState, raftLog := buildReplicaDescriptorFromTestData(t, replica)
replicaID, key, desc, replicaState, hardState, raftLog :=
buildReplicaDescriptorFromTestData(t, replica)

eng := e.getOrCreateStore(ctx, t, replica.StoreID, replica.NodeID)
if err = storage.MVCCPutProto(ctx, eng, nil, key, clock.Now(), nil, /* txn */
Expand All @@ -210,6 +218,9 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes
if err := sl.SetHardState(ctx, eng, hardState); err != nil {
t.Fatalf("failed to save raft hard state: %v", err)
}
if err := sl.SetRaftReplicaID(ctx, eng, replicaID); err != nil {
t.Fatalf("failed to set raft replica ID: %v", err)
}
for i, entry := range raftLog {
value, err := protoutil.Marshal(&entry)
if err != nil {
Expand All @@ -227,6 +238,7 @@ func (e *quorumRecoveryEnv) handleReplicationData(t *testing.T, d datadriven.Tes
func buildReplicaDescriptorFromTestData(
t *testing.T, replica testReplicaInfo,
) (
roachpb.ReplicaID,
roachpb.Key,
roachpb.RangeDescriptor,
kvserverpb.ReplicaState,
Expand All @@ -239,12 +251,16 @@ func buildReplicaDescriptorFromTestData(
endKey := parsePrettyKey(t, replica.EndKey)
key := keys.RangeDescriptorKey(startKey)
var replicas []roachpb.ReplicaDescriptor
var replicaID roachpb.ReplicaID
maxReplicaID := replica.Replicas[0].ReplicaID
for _, r := range replica.Replicas {
if r.ReplicaID > maxReplicaID {
maxReplicaID = r.ReplicaID
}
replicas = append(replicas, r.asReplicaDescriptor())
if r.NodeID == replica.NodeID && r.StoreID == replica.StoreID {
replicaID = r.ReplicaID
}
}
if replica.Generation == 0 {
replica.Generation = roachpb.RangeGeneration(maxReplicaID)
Expand Down Expand Up @@ -292,7 +308,7 @@ func buildReplicaDescriptorFromTestData(
entry := raftLogFromPendingDescriptorUpdate(t, replica, u, desc, uint64(i))
raftLog = append(raftLog, enginepb.MVCCMetadata{RawBytes: entry.RawBytes})
}
return key, desc, replicaState, hardState, raftLog
return replicaID, key, desc, replicaState, hardState, raftLog
}

func raftLogFromPendingDescriptorUpdate(
Expand Down Expand Up @@ -520,10 +536,21 @@ func (e *quorumRecoveryEnv) handleDumpStore(t *testing.T, d datadriven.TestData)
var storesView []storeView
for _, storeID := range stores {
var descriptorViews []storeDescriptorView
var localDataViews []localDataView
store := e.stores[storeID]
err := kvserver.IterateRangeDescriptorsFromDisk(ctx, store.engine,
func(desc roachpb.RangeDescriptor) error {
descriptorViews = append(descriptorViews, descriptorView(desc))

sl := stateloader.Make(desc.RangeID)
raftReplicaID, _, err := sl.LoadRaftReplicaID(ctx, store.engine)
if err != nil {
t.Fatalf("failed to load Raft replica ID: %v", err)
}
localDataViews = append(localDataViews, localDataView{
RangeID: desc.RangeID,
RaftReplicaID: int(raftReplicaID.ReplicaID),
})
return nil
})
if err != nil {
Expand All @@ -533,6 +560,7 @@ func (e *quorumRecoveryEnv) handleDumpStore(t *testing.T, d datadriven.TestData)
NodeID: e.stores[storeID].nodeID,
StoreID: storeID,
Descriptors: descriptorViews,
LocalData: localDataViews,
})
}
out, err := yaml.Marshal(storesView)
Expand Down
6 changes: 6 additions & 0 deletions pkg/kv/kvserver/loqrecovery/testdata/learners_lose
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ dump-store stores=(1,2)
StartKey: /Min
Replicas:
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 16}
LocalData:
- RangeID: 1
RaftReplicaID: 16
- NodeID: 2
StoreID: 2
Descriptors:
Expand All @@ -72,6 +75,9 @@ dump-store stores=(1,2)
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
- Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4}
- Replica: {NodeID: 5, StoreID: 5, ReplicaID: 5}
LocalData:
- RangeID: 1
RaftReplicaID: 2

# Second use case where we can't make a decision and fail keyspace coverage as
# only a single learner is left, there is no way to recover.
Expand Down
24 changes: 24 additions & 0 deletions pkg/kv/kvserver/loqrecovery/testdata/max_applied_voter_wins
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ dump-store stores=(1,2)
StartKey: /Min
Replicas:
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 16}
LocalData:
- RangeID: 1
RaftReplicaID: 16
- NodeID: 2
StoreID: 2
Descriptors:
Expand All @@ -70,6 +73,9 @@ dump-store stores=(1,2)
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
- Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4}
- Replica: {NodeID: 5, StoreID: 5, ReplicaID: 5}
LocalData:
- RangeID: 1
RaftReplicaID: 2

dump-events stores=(1,2)
----
Expand Down Expand Up @@ -184,6 +190,11 @@ dump-store stores=(1,2,5,6)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 5, StoreID: 5, ReplicaID: 6}
- Replica: {NodeID: 6, StoreID: 6, ReplicaID: 7}
LocalData:
- RangeID: 1
RaftReplicaID: 1
- RangeID: 2
RaftReplicaID: 1
- NodeID: 2
StoreID: 2
Descriptors:
Expand All @@ -193,6 +204,9 @@ dump-store stores=(1,2,5,6)
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2}
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
- Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4}
LocalData:
- RangeID: 1
RaftReplicaID: 2
- NodeID: 5
StoreID: 5
Descriptors:
Expand All @@ -208,6 +222,11 @@ dump-store stores=(1,2,5,6)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 5, StoreID: 5, ReplicaID: 6}
- Replica: {NodeID: 6, StoreID: 6, ReplicaID: 7}
LocalData:
- RangeID: 1
RaftReplicaID: 6
- RangeID: 2
RaftReplicaID: 6
- NodeID: 6
StoreID: 6
Descriptors:
Expand All @@ -223,3 +242,8 @@ dump-store stores=(1,2,5,6)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 5, StoreID: 5, ReplicaID: 6}
- Replica: {NodeID: 6, StoreID: 6, ReplicaID: 7}
LocalData:
- RangeID: 1
RaftReplicaID: 7
- RangeID: 2
RaftReplicaID: 7
6 changes: 6 additions & 0 deletions pkg/kv/kvserver/loqrecovery/testdata/max_store_voter_wins
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,16 @@ dump-store stores=(1,2)
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
- Replica: {NodeID: 4, StoreID: 4, ReplicaID: 4}
- Replica: {NodeID: 5, StoreID: 5, ReplicaID: 5}
LocalData:
- RangeID: 1
RaftReplicaID: 1
- NodeID: 2
StoreID: 2
Descriptors:
- RangeID: 1
StartKey: /Min
Replicas:
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 16}
LocalData:
- RangeID: 1
RaftReplicaID: 16
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ dump-store stores=(1,2,3)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2}
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
LocalData:
- RangeID: 1
RaftReplicaID: 1
- NodeID: 2
StoreID: 2
Descriptors:
Expand All @@ -67,6 +70,9 @@ dump-store stores=(1,2,3)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2}
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
LocalData:
- RangeID: 1
RaftReplicaID: 2
- NodeID: 3
StoreID: 3
Descriptors:
Expand All @@ -76,3 +82,6 @@ dump-store stores=(1,2,3)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2}
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
LocalData:
- RangeID: 1
RaftReplicaID: 3
6 changes: 6 additions & 0 deletions pkg/kv/kvserver/loqrecovery/testdata/no_change_when_quorum
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ dump-store stores=(1,2)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2}
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
LocalData:
- RangeID: 1
RaftReplicaID: 1
- NodeID: 2
StoreID: 2
Descriptors:
Expand All @@ -56,3 +59,6 @@ dump-store stores=(1,2)
- Replica: {NodeID: 1, StoreID: 1, ReplicaID: 1}
- Replica: {NodeID: 2, StoreID: 2, ReplicaID: 2}
- Replica: {NodeID: 3, StoreID: 3, ReplicaID: 3}
LocalData:
- RangeID: 1
RaftReplicaID: 2

0 comments on commit ce57830

Please sign in to comment.