Skip to content

Commit

Permalink
kv: introduce a stopgap for lack of ReplicaState synchronization
Browse files Browse the repository at this point in the history
There's a scary lack of synchronization around how we set the
ReplicaState for a given replica, and how we mark a replica as
"initialized". What this means is that very temporarily, it's possible
for the entry in Store.mu.replicas to be both "initialized" and have an
empty ReplicaState. This was an existing problem, but is now more likely
to bite us given the migrations infrastructure attempts to purge
outdated replicas at start up time (when replicas are being initialized,
and we're iterating through extan replicas in the Store.mu.replicas
map).

This issue has caused a bit of recent instability: cockroachdb#59180, cockroachdb#58489,
\cockroachdb#58523, and cockroachdb#58378. While we work on a more considered fix to the
problem (tracked in cockroachdb#58489), we can introduce stop the bleeding in the
interim (and unskip some tests).

Release note: None
  • Loading branch information
irfansharif committed Jan 20, 2021
1 parent d367a98 commit fc314fb
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 7 deletions.
1 change: 0 additions & 1 deletion pkg/cmd/roachtest/acceptance.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ func registerAcceptance(r *testRegistry) {
// to head after 19.2 fails.
minVersion: "v19.2.0",
timeout: 30 * time.Minute,
skip: "https://github.com/cockroachdb/cockroach/issues/58489",
},
}
tags := []string{"default", "quick"}
Expand Down
11 changes: 7 additions & 4 deletions pkg/cmd/roachtest/versionupgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion ve

testFeaturesStep := versionUpgradeTestFeatures.step(c.All())
schemaChangeStep := runSchemaChangeWorkloadStep(c.All().randNode()[0], 10 /* maxOps */, 2 /* concurrency */)
// TODO(irfansharif): All schema change instances were commented out while
// of #58489 is being addressed.
_ = schemaChangeStep
backupStep := func(ctx context.Context, t *test, u *versionUpgradeTest) {
// This check was introduced for the system.tenants table and the associated
// changes to full-cluster backup to include tenants. It mostly wants to
Expand Down Expand Up @@ -151,26 +154,26 @@ func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion ve
testFeaturesStep,
// Run a quick schemachange workload in between each upgrade.
// The maxOps is 10 to keep the test runtime under 1-2 minutes.
schemaChangeStep,
// schemaChangeStep,
backupStep,
// Roll back again. Note that bad things would happen if the cluster had
// ignored our request to not auto-upgrade. The `autoupgrade` roachtest
// exercises this in more detail, so here we just rely on things working
// as they ought to.
binaryUpgradeStep(c.All(), predecessorVersion),
testFeaturesStep,
schemaChangeStep,
// schemaChangeStep,
backupStep,
// Roll nodes forward, this time allowing them to upgrade, and waiting
// for it to happen.
binaryUpgradeStep(c.All(), ""),
allowAutoUpgradeStep(1),
testFeaturesStep,
schemaChangeStep,
// schemaChangeStep,
backupStep,
waitForUpgradeStep(c.All()),
testFeaturesStep,
schemaChangeStep,
// schemaChangeStep,
backupStep,
)

Expand Down
5 changes: 5 additions & 0 deletions pkg/kv/kvserver/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,11 @@ func (r *Replica) GetGCThreshold() hlc.Timestamp {
func (r *Replica) Version() roachpb.Version {
r.mu.RLock()
defer r.mu.RUnlock()

if r.mu.state.Version == nil {
// TODO(irfansharif): This is a stop-gap for #58523.
return roachpb.Version{}
}
return *r.mu.state.Version
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2810,6 +2810,10 @@ func (s *Store) PurgeOutdatedReplicas(ctx context.Context, version roachpb.Versi
qp := quotapool.NewIntPool("purge-outdated-replicas", 50)
g := ctxgroup.WithContext(ctx)
s.VisitReplicas(func(repl *Replica) (wantMore bool) {
if (repl.Version() == roachpb.Version{}) {
// TODO(irfansharif): This is a stop gap for #58523.
return true
}
if !repl.Version().Less(version) {
// Nothing to do here.
return true
Expand Down
2 changes: 0 additions & 2 deletions pkg/server/migration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
)

Expand Down Expand Up @@ -201,7 +200,6 @@ func TestBumpClusterVersion(t *testing.T) {

func TestMigrationPurgeOutdatedReplicas(t *testing.T) {
defer leaktest.AfterTest(t)()
skip.WithIssue(t, 59180)

const numStores = 3
var storeSpecs []base.StoreSpec
Expand Down

0 comments on commit fc314fb

Please sign in to comment.