Skip to content

Commit

Permalink
kvserver/loqrecovery: check full key coverage in quorum recovery
Browse files Browse the repository at this point in the history
Previously, when doing unsafe replica recovery, if some ranges were
missing or represented by stale replicas that had been split or merged,
recovery would leave the cluster in an inconsistent state with gaps or
overlaps in the keyspace.
This change adds checks for range completeness and a preference for
replicas with a higher range applied index.

Release note: None
  • Loading branch information
aliher1911 committed Jan 5, 2022
1 parent 01c0c73 commit 3c10d6c
Show file tree
Hide file tree
Showing 13 changed files with 634 additions and 185 deletions.
5 changes: 3 additions & 2 deletions pkg/cli/debug_recover_loss_of_quorum.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,9 @@ Discarded live replicas: %d
`, report.TotalReplicas, len(report.PlannedUpdates), report.DiscardedNonSurvivors)
for _, r := range report.PlannedUpdates {
_, _ = fmt.Fprintf(stderr, "Recovering range r%d:%s updating replica %s to %s. "+
"Discarding replicas: %s\n",
r.RangeID, r.StartKey, r.OldReplica, r.Replica, r.DiscardedReplicas)
"Discarding available replicas: [%s], discarding dead replicas: [%s].\n",
r.RangeID, r.StartKey, r.OldReplica, r.Replica,
r.DiscardedAvailableReplicas, r.DiscardedDeadReplicas)
}

deadStoreMsg := fmt.Sprintf("\nDiscovered dead stores from provided files: %s",
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvserver/loqrecovery/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ go_test(
"//pkg/util/leaktest",
"//pkg/util/uuid",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
"@in_gopkg_yaml_v2//:yaml_v2",
"@io_etcd_go_etcd_raft_v3//raftpb",
Expand Down
4 changes: 2 additions & 2 deletions pkg/kv/kvserver/loqrecovery/apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func PrepareUpdateReplicas(
}

if len(missing) > 0 {
report.MissingStores = storeListFromSet(missing)
report.MissingStores = storeSliceFromSet(missing)
}
return report, nil
}
Expand All @@ -113,7 +113,7 @@ func applyReplicaUpdate(
clock := hlc.NewClock(hlc.UnixNano, 0)
report := PrepareReplicaReport{
RangeID: update.RangeID,
Replica: *update.NewReplica,
Replica: update.NewReplica,
StartKey: update.StartKey.AsRKey(),
}

Expand Down
12 changes: 12 additions & 0 deletions pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,15 @@ func (m ReplicaUpdate) NodeID() roachpb.NodeID {
func (m ReplicaUpdate) StoreID() roachpb.StoreID {
	// The store ID is read from the update's new replica descriptor.
	newReplica := m.NewReplica
	return newReplica.StoreID
}

// Replica looks up, in the range descriptor carried by this info, the
// replica descriptor that belongs to the store the info was collected
// from. It returns an error when the descriptor contains no replica for
// that store.
func (m *ReplicaInfo) Replica() (roachpb.ReplicaDescriptor, error) {
	replDesc, found := m.Desc.GetReplicaDescriptor(m.StoreID)
	if !found {
		// The info's own store is not listed in its descriptor; report
		// the inconsistency and hand back an empty descriptor.
		return roachpb.ReplicaDescriptor{}, errors.Errorf(
			"invalid replica info: its own store s%d is not present in descriptor replicas %s",
			m.StoreID, m.Desc)
	}
	return replDesc, nil
}
3 changes: 2 additions & 1 deletion pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.proto
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ message ReplicaUpdate {
int32 old_replica_id = 3 [(gogoproto.customname) = "OldReplicaID",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID",
(gogoproto.moretags) = "yaml:\"OldReplicaID\""];
roachpb.ReplicaDescriptor new_replica = 4 [(gogoproto.moretags) = "yaml:\"NewReplica\""];
roachpb.ReplicaDescriptor new_replica = 4 [(gogoproto.nullable) = false,
(gogoproto.moretags) = "yaml:\"NewReplica\""];
int32 next_replica_id = 5 [(gogoproto.customname) = "NextReplicaID",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID",
(gogoproto.moretags) = "yaml:\"NextReplicaID\""];
Expand Down
Loading

0 comments on commit 3c10d6c

Please sign in to comment.