From be85b004f8cf3c4741c951bdf0250278d65a066d Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Fri, 17 Apr 2020 16:38:02 -0400 Subject: [PATCH] kv: deflake TestInitRaftGroupOnRequest Fixes #42808. Fixes #44146. Fixes #47020. Fixes #47551. Fixes #47231. Disable async intent resolution. This can lead to flakiness in the test because it allows for the intents written by the split transaction to be resolved at any time, including after the nodes are restarted. The intent resolution on the RHS's local range descriptor intent can both wake up the RHS range's Raft group and result in the wrong replica acquiring the lease. I was always seeing this in conjunction with the log line: ``` kv/kvserver/intentresolver/intent_resolver.go:746 failed to gc transaction record: could not GC completed transaction anchored at /Local/Range/Table/50/RangeDescriptor: node unavailable; try another peer ``` Before the fix, the test failed almost immediately when stressed on a roachprod cluster. After, I've never seen it flake: ``` 576962 runs so far, 0 failures, over 19m35s ``` I think this may have gotten more flaky after we began batching intent resolution, as this batching also introduced a delay to the async task. I'll backport this to the past few release branches. --- pkg/kv/kvserver/client_raft_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 7858640bc4d8..05c3127b0c34 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -3914,6 +3914,12 @@ func TestInitRaftGroupOnRequest(t *testing.T) { defer leaktest.AfterTest(t)() storeCfg := kvserver.TestStoreConfig(nil /* clock */) storeCfg.TestingKnobs.DisableMergeQueue = true + // Disable async intent resolution. This can lead to flakiness in the test + // because it allows for the intents written by the split transaction to be + // resolved at any time, including after the nodes are restarted. The intent + // resolution on the RHS's local range descriptor can both wake up the RHS + // range's Raft group and result in the wrong replica acquiring the lease. + storeCfg.TestingKnobs.IntentResolverKnobs.DisableAsyncIntentResolution = true mtc := &multiTestContext{ storeConfig: &storeCfg, // TODO(andrei): This test was written before multiTestContexts started with @@ -3945,6 +3951,8 @@ func TestInitRaftGroupOnRequest(t *testing.T) { mtc.restart() // Get replica from the store which isn't the leaseholder. + // NOTE: StoreID is 1-indexed and storeIdx is 0-indexed, so despite what + // this might look like, this is grabbing the replica without the lease. storeIdx := int(lease.Replica.StoreID) % len(mtc.stores) if repl = mtc.stores[storeIdx].LookupReplica(roachpb.RKey(splitKey)); repl == nil { t.Fatal("replica should not be nil for RHS range")