storage: Move replicaGCQueue logic out of LocalProposalData
LocalProposalData is processed only on the proposer, which in practice
is unlikely to be the node being removed (though in our tests it
apparently was, except in rare cases).

This deflakes the SplitSnapshotRace tests, which would occasionally end
up with the lease on a different store and miss the automatic GC (and
because the test uses a manual clock, the background replicaGCQueue scan
wouldn't GC the removed replicas either).

Fixes #8416
Fixes #9204
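
To make the distinction concrete, here is a minimal, runnable Go sketch (not the CockroachDB code; the store type, apply method, and print statements are illustrative stand-ins) showing why a proposer-only side effect usually misses the store being removed, while a replicated side effect runs on every store that applies the command:

package main

import "fmt"

// store stands in for one node's store; proposer marks the replica that
// proposed the ChangeReplicas command.
type store struct {
	storeID  int
	proposer bool
}

// apply mimics applying a committed REMOVE_REPLICA command on one store.
func (s *store) apply(removedStoreID int) {
	// Replicated side effect: runs on every store that applies the command,
	// so the store being removed reliably sees it and can queue itself for GC.
	if s.storeID == removedStoreID {
		fmt.Printf("store %d: adding own replica to the GC queue\n", s.storeID)
	}

	// Local (proposer-only) side effect: only one store takes this branch,
	// and that store is usually not the one being removed, which is why the
	// old placement of the GC trigger fired mostly by accident.
	if s.proposer {
		fmt.Printf("store %d: proposer-only follow-up (e.g. split queue check)\n", s.storeID)
	}
}

func main() {
	stores := []*store{{storeID: 1, proposer: true}, {storeID: 2}, {storeID: 3}}
	for _, s := range stores {
		s.apply(3) // the replica on store 3 is being removed
	}
}

Moving the GC trigger to the replicated path is what lets the removed store queue itself, regardless of which store holds the lease.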
bdarnell committed Nov 7, 2016
1 parent 278e8b1 commit 29287c3
Showing 3 changed files with 21 additions and 31 deletions.
1 change: 0 additions & 1 deletion pkg/storage/client_split_test.go
@@ -1029,7 +1029,6 @@ func TestSplitSnapshotRace_SplitWins(t *testing.T) {
 // so it still has a conflicting range.
 func TestSplitSnapshotRace_SnapshotWins(t *testing.T) {
 	defer leaktest.AfterTest(t)()
-	t.Skip("#8416")
 	runSetupSplitSnapshotRace(t, func(mtc *multiTestContext, leftKey, rightKey roachpb.Key) {
 		// Bring the right range up first.
 		for i := 3; i <= 5; i++ {
23 changes: 7 additions & 16 deletions pkg/storage/replica_command.go
@@ -2994,22 +2994,13 @@ func (r *Replica) changeReplicasTrigger(
 	ctx context.Context, batch engine.Batch, change *roachpb.ChangeReplicasTrigger,
 ) ProposalData {
 	var pd ProposalData
-	// If we're removing the current replica, add it to the range GC queue.
-	if change.ChangeType == roachpb.REMOVE_REPLICA && r.store.StoreID() == change.Replica.StoreID {
-		// This wants to run as late as possible, maximizing the chances
-		// that the other nodes have finished this command as well (since
-		// processing the removal from the queue looks up the Range at the
-		// lease holder, being too early here turns this into a no-op).
-		pd.addToReplicaGCQueue = true
-	} else {
-		// After a successful replica addition or removal check to see if the
-		// range needs to be split. Splitting usually takes precedence over
-		// replication via configuration of the split and replicate queues, but
-		// if the split occurs concurrently with the replicas change the split
-		// can fail and won't retry until the next scanner cycle. Re-queuing
-		// the replica here removes that latency.
-		pd.maybeAddToSplitQueue = true
-	}
+	// After a successful replica addition or removal check to see if the
+	// range needs to be split. Splitting usually takes precedence over
+	// replication via configuration of the split and replicate queues, but
+	// if the split occurs concurrently with the replicas change the split
+	// can fail and won't retry until the next scanner cycle. Re-queuing
+	// the replica here removes that latency.
+	pd.maybeAddToSplitQueue = true
 
 	// Gossip the first range whenever the range descriptor changes. We also
 	// gossip the first range whenever the lease holder changes, but that might
28 changes: 14 additions & 14 deletions pkg/storage/replica_proposal.go
@@ -82,18 +82,13 @@ type LocalProposalData struct {
 	// The downstream-of-Raft logic does not exist at time of writing.
 	leaseMetricsResult *bool
 
-	// TODO(tschottdorf): there is no need to ever have these actions below
-	// taken on the followers, correct?
-
 	// When set (in which case we better be the first range), call
 	// gossipFirstRange if the Replica holds the lease.
 	gossipFirstRange bool
 	// Call maybeGossipSystemConfig.
 	maybeGossipSystemConfig bool
 	// Call maybeAddToSplitQueue.
 	maybeAddToSplitQueue bool
-	// Call maybeAddToReplicaGCQueue.
-	addToReplicaGCQueue bool
 	// Call maybeGossipNodeLiveness with the specified Span, if set.
 	maybeGossipNodeLiveness *roachpb.Span
 }
@@ -238,7 +233,6 @@ func (p *ProposalData) MergeAndDestroy(q ProposalData) error {
 	coalesceBool(&p.gossipFirstRange, &q.gossipFirstRange)
 	coalesceBool(&p.maybeGossipSystemConfig, &q.maybeGossipSystemConfig)
 	coalesceBool(&p.maybeAddToSplitQueue, &q.maybeAddToSplitQueue)
-	coalesceBool(&p.addToReplicaGCQueue, &q.addToReplicaGCQueue)
 
 	if (q != ProposalData{}) {
 		log.Fatalf(context.TODO(), "unhandled ProposalData: %s", pretty.Diff(q, ProposalData{}))
@@ -512,6 +506,20 @@ func (r *Replica) handleReplicatedProposalData(
 			)
 		}
 		rpd.State.Desc = nil
 	}
 
+	if change := rpd.ChangeReplicas; change != nil {
+		if change.ChangeType == roachpb.REMOVE_REPLICA &&
+			r.store.StoreID() == change.Replica.StoreID {
+			// This wants to run as late as possible, maximizing the chances
+			// that the other nodes have finished this command as well (since
+			// processing the removal from the queue looks up the Range at the
+			// lease holder, being too early here turns this into a no-op).
+			if _, err := r.store.replicaGCQueue.Add(r, replicaGCPriorityRemoved); err != nil {
+				// Log the error; the range should still be GC'd eventually.
+				log.Errorf(ctx, "unable to add to replica GC queue: %s", err)
+			}
+		}
+		rpd.ChangeReplicas = nil
+	}
 
@@ -631,14 +639,6 @@ func (r *Replica) handleLocalProposalData(
 		lpd.gossipFirstRange = false
 	}
 
-	if lpd.addToReplicaGCQueue {
-		if _, err := r.store.replicaGCQueue.Add(r, replicaGCPriorityRemoved); err != nil {
-			// Log the error; the range should still be GC'd eventually.
-			log.Errorf(ctx, "unable to add to replica GC queue: %s", err)
-		}
-		lpd.addToReplicaGCQueue = false
-	}
-
 	if lpd.maybeAddToSplitQueue {
 		r.store.splitQueue.MaybeAdd(r, r.store.Clock().Now())
 		lpd.maybeAddToSplitQueue = false
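One more pattern worth noting from the replica_proposal.go hunks above: each handler zeroes the fields it consumed (for example, rpd.ChangeReplicas = nil), and MergeAndDestroy fatals if any field is left unhandled, so a newly added trigger cannot be silently dropped. Below is a minimal Go sketch of that consume-and-zero check, using made-up field names that assume nothing about the real ProposalData layout:

package main

import (
	"fmt"
	"log"
)

// sideEffects is a made-up analogue of the proposal side-effect struct.
type sideEffects struct {
	AddToGCQueue    bool
	AddToSplitQueue bool
}

// handle consumes each side effect it knows about and zeroes the field.
func handle(se *sideEffects) {
	if se.AddToGCQueue {
		fmt.Println("queueing replica for GC")
		se.AddToGCQueue = false // handled
	}
	if se.AddToSplitQueue {
		fmt.Println("queueing replica for a split check")
		se.AddToSplitQueue = false // handled
	}
	// Anything still set here was never handled above; treat it as a bug.
	if *se != (sideEffects{}) {
		log.Fatalf("unhandled side effects: %+v", *se)
	}
}

func main() {
	handle(&sideEffects{AddToGCQueue: true, AddToSplitQueue: true})
}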
