From 3b2ea0c128559f95d08886eb83268cccb3b88dda Mon Sep 17 00:00:00 2001
From: Nikhil Benesch
Date: Thu, 18 Oct 2018 13:31:58 -0400
Subject: [PATCH] storage: always queue for replica GC upon intersecting
 snapshot

If a store receives a snapshot that overlaps an existing replica, we
take it as a sign that the local replica may no longer be a member of
its range and queue it for processing in the replica GC queue.

When this code was added (#10426), the replica GC queue was quite
aggressive about processing replicas, and so the implementation was
careful to only queue a replica if it looked "inactive."
Unfortunately, this inactivity check rotted when epoch-based leases
were introduced a month later (#10305). An inactive replica with an
epoch-based lease can incorrectly be considered active, even if all of
the other members of the range have stopped sending it messages,
because the epoch-based lease will continue to be heartbeated by the
node itself. (With an expiration-based lease, the replica's local copy
of the lease would quickly expire if the other members of the range
stopped sending it messages.)

Fixing the inactivity check to work with epoch-based leases is rather
tricky. Quiescent replicas are nearly indistinguishable from abandoned
replicas. This commit just removes the inactivity check and
unconditionally queues replicas for GC if they intersect an incoming
snapshot. The replica GC queue check is relatively cheap (one or two
meta2 lookups), and overlapping snapshot situations are not expected
to last for very long.

Release note: None
---
 pkg/storage/store_snapshot.go | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/pkg/storage/store_snapshot.go b/pkg/storage/store_snapshot.go
index a48e58ffbac4..9cefb6703ed9 100644
--- a/pkg/storage/store_snapshot.go
+++ b/pkg/storage/store_snapshot.go
@@ -437,20 +437,35 @@ func (s *Store) canApplySnapshotLocked(
 				if r.RaftStatus() == nil {
 					return true
 				}
+				// TODO(benesch): this check does not detect inactivity on replicas with
+				// epoch-based leases. Since the validity of an epoch-based lease is
+				// tied to the owning node's liveness, the lease can be valid well after
+				// the leader of the range has cut off communication with this replica.
+				// Expiration-based leases, by contrast, will expire quickly if the
+				// leader of the range stops sending this replica heartbeats.
 				lease, pendingLease := r.GetLease()
 				now := s.Clock().Now()
 				return !r.IsLeaseValid(lease, now) &&
 					(pendingLease == (roachpb.Lease{}) || !r.IsLeaseValid(pendingLease, now))
 			}
-
-			// If the existing range shows no signs of recent activity, give it a GC
-			// run.
+			// We unconditionally send this replica through the GC queue. It's
+			// reasonably likely that the GC queue will do nothing because the replica
+			// needs to split instead, but better to err on the side of queueing too
+			// frequently. Blocking Raft snapshots for too long can wedge a cluster,
+			// and if the replica does need to be GC'd, this might be the only code
+			// path that notices in a timely fashion.
+			//
+			// We're careful to avoid starving out other replicas in the GC queue by
+			// queueing at a low priority unless we can prove that the range is
+			// inactive and thus unlikely to be about to process a split.
+			gcPriority := replicaGCPriorityDefault
 			if inactive(exReplica) {
-				if _, err := s.replicaGCQueue.Add(exReplica, replicaGCPriorityCandidate); err != nil {
-					log.Errorf(ctx, "%s: unable to add replica to GC queue: %s", exReplica, err)
-				} else {
-					msg += "; initiated GC:"
-				}
+				gcPriority = replicaGCPriorityCandidate
+			}
+			if _, err := s.replicaGCQueue.Add(exReplica, gcPriority); err != nil {
+				log.Errorf(ctx, "%s: unable to add replica to GC queue: %s", exReplica, err)
+			} else {
+				msg += "; initiated GC:"
 			}
 		}
 		return nil, errors.Errorf("%s %v", msg, exReplica) // exReplica can be nil
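
To make the failure mode described in the commit message concrete, here is a minimal, self-contained Go sketch of the inactivity heuristic this patch removes. The leaseKind, lease, nodeLiveness, and isLeaseValid names below are hypothetical simplifications invented for illustration; they are not CockroachDB's roachpb.Lease or node-liveness APIs. The sketch only models why an expiration-based lease on an abandoned replica lapses quickly, while an epoch-based lease stays valid as long as its node keeps heartbeating its own liveness record.

package main

import (
	"fmt"
	"time"
)

// leaseKind, lease, and nodeLiveness are hypothetical, simplified types for
// illustration only; they are not CockroachDB's real lease or liveness types.
type leaseKind int

const (
	expirationBased leaseKind = iota
	epochBased
)

type lease struct {
	kind       leaseKind
	expiration time.Time // meaningful only for expiration-based leases
	epoch      int64     // meaningful only for epoch-based leases
}

// nodeLiveness models the owning node's liveness record. The validity of an
// epoch-based lease hangs off this record rather than off range traffic.
type nodeLiveness struct {
	epoch     int64
	heartbeat time.Time
	ttl       time.Duration
}

// isLeaseValid mirrors the shape of the heuristic described in the commit
// message: an expiration-based lease lapses once the leaseholder stops
// renewing it, but an epoch-based lease stays valid as long as the node
// itself keeps heartbeating its liveness record, even after the rest of the
// range has abandoned the replica.
func isLeaseValid(l lease, liveness nodeLiveness, now time.Time) bool {
	switch l.kind {
	case expirationBased:
		return now.Before(l.expiration)
	case epochBased:
		return l.epoch == liveness.epoch &&
			now.Before(liveness.heartbeat.Add(liveness.ttl))
	default:
		return false
	}
}

func main() {
	now := time.Now()
	// The rest of the range cut off this replica ten minutes ago, so its
	// expiration-based lease was last extended back then.
	lastContact := now.Add(-10 * time.Minute)
	expLease := lease{kind: expirationBased, expiration: lastContact.Add(9 * time.Second)}
	epochLease := lease{kind: epochBased, epoch: 3}

	// The node hosting the abandoned replica is still up and heartbeating its
	// own liveness record, which keeps the epoch-based lease alive.
	liveness := nodeLiveness{epoch: 3, heartbeat: now.Add(-2 * time.Second), ttl: 9 * time.Second}

	fmt.Println("expiration-based lease still valid:", isLeaseValid(expLease, liveness, now)) // false
	fmt.Println("epoch-based lease still valid:", isLeaseValid(epochLease, liveness, now))    // true
}

Run as a standalone program, the sketch reports the expiration-based lease as invalid and the epoch-based lease as still valid, which is exactly the situation that let the removed check treat an abandoned replica as active.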