Merge pull request #99351 from cockroachdb/blathers/backport-release-22.2-99020
aliher1911 authored Mar 30, 2023
2 parents e479ff9 + c10b296 commit 2459f2e
Showing 1 changed file with 61 additions and 24 deletions.
85 changes: 61 additions & 24 deletions pkg/cmd/roachtest/tests/replicagc.go
@@ -42,6 +42,8 @@ func registerReplicaGC(r registry.Registry) {

var deadNodeAttr = "deadnode"

const storeDeadTimeout = 60 * time.Second

// runReplicaGCChangedPeers checks that when a node has all of its replicas
// taken away in absentia and then restarts, without being able to talk to any
// of its old peers, it will still replicaGC its (now stale) replicas quickly.
@@ -61,7 +63,6 @@ func runReplicaGCChangedPeers(
}

c.Put(ctx, t.Cockroach(), "./cockroach")
c.Put(ctx, t.DeprecatedWorkload(), "./workload", c.Node(1))
settings := install.MakeClusterSettings(install.EnvOption([]string{"COCKROACH_SCAN_MAX_IDLE_TIME=5ms"}))
c.Start(ctx, t.L(), option.DefaultStartOpts(), settings, c.Range(1, 3))

@@ -70,8 +71,20 @@
t.Status("waiting for full replication")
h.waitForFullReplication(ctx)

// Set the dead store timeout to 1 minute to work around the allocator issue
// mentioned in waitForZeroReplicasOnN3.
func() {
db := c.Conn(ctx, t.L(), 1)
defer db.Close()
if _, err := db.ExecContext(ctx,
"SET CLUSTER SETTING server.time_until_store_dead = $1", storeDeadTimeout.String(),
); err != nil {
t.Fatal(err)
}
}()

// Fill in a bunch of data.
c.Run(ctx, c.Node(1), "./workload init kv {pgurl:1} --splits 100")
c.Run(ctx, c.Node(1), "./cockroach workload init kv {pgurl:1} --splits 100")

// Kill the third node so it won't know that all of its replicas are moved
// elsewhere (we don't use the first because that's what roachprod will
@@ -141,7 +154,7 @@ func runReplicaGCChangedPeers(
t.Fatal(err)
}
startOpts := option.DefaultStartOpts()
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--join="+internalAddrs[0], "--attrs="+deadNodeAttr, "--vmodule=raft=5,replicate_queue=5,allocator=5")
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--join="+internalAddrs[0], "--attrs="+deadNodeAttr)
c.Start(ctx, t.L(), startOpts, settings, c.Node(3))

// Loop for two metric sample intervals (10s) to make sure n3 doesn't see any
@@ -206,7 +219,7 @@ func (h *replicagcTestHelper) numReplicas(ctx context.Context, db *gosql.DB, tar
).Scan(&n); err != nil {
h.t.Fatal(err)
}
h.t.L().Printf("found %d replicas found on n%d\n", n, targetNode)
h.t.L().Printf("found %d replicas on n%d\n", n, targetNode)
return n
}

@@ -260,29 +273,53 @@ func (h *replicagcTestHelper) isolateDeadNodes(ctx context.Context, runNode int)
}

func waitForZeroReplicasOnN3(ctx context.Context, t test.Test, db *gosql.DB) {
if err := retry.ForDuration(5*time.Minute, func() error {
const q = `select range_id, replicas from crdb_internal.ranges_no_leases where replicas @> ARRAY[3];`
rows, err := db.QueryContext(ctx, q)
if err != nil {
return err
}
m := make(map[int64]string)
for rows.Next() {
var rangeID int64
var replicas string
if err := rows.Scan(&rangeID, &replicas); err != nil {
var err error
// Note that the deadline is a special case for a dead node. Dead nodes are not
// reported as decommissioning internally and stay as
// livenesspb.NodeLivenessStatus_UNAVAILABLE. This prevents the allocator from
// eagerly moving replicas off the node; it will only do so once the node is
// marked livenesspb.NodeLivenessStatus_DEAD. To reach the latter status, the
// node has to stay unavailable for server.time_until_store_dead (configured
// down from 5 min to 1 min in this test). So the resulting timeout for
// replicas to move is 1 min plus however long the allocator needs to move
// them. Since we set it to process the queue aggressively, we give it one
// minute. In practice, it usually takes about 10 seconds.
deadline := timeutil.Now().Add(storeDeadTimeout + time.Minute)
// We don't use exponential backoff here because we don't expect it to succeed
// immediately.
for r := retry.StartWithCtx(ctx, retry.Options{
InitialBackoff: 10 * time.Second,
MaxBackoff: 10 * time.Second,
}); r.Next() && timeutil.Now().Before(deadline); {
err = func() error {
const q = `select range_id, replicas from crdb_internal.ranges_no_leases where replicas @> ARRAY[3];`
rows, err := db.QueryContext(ctx, q)
if err != nil {
return err
}
m[rangeID] = replicas
}
if err := rows.Err(); err != nil {
return err
}
if len(m) == 0 {
return nil
m := make(map[int64]string)
for rows.Next() {
var rangeID int64
var replicas string
if err := rows.Scan(&rangeID, &replicas); err != nil {
return err
}
m[rangeID] = replicas
}
if err := rows.Err(); err != nil {
return err
}
t.L().Printf("found %d replicas on n3\n", len(m))
if len(m) == 0 {
return nil
}
return errors.Errorf("ranges remained on n3 (according to meta2): %+v", m)
}()
if err == nil {
break
}
return errors.Errorf("ranges remained on n3 (according to meta2): %+v", m)
}); err != nil {
}
if err != nil {
t.Fatal(err)
}
}
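
For readers who want to try the new wait logic outside the roachtest harness, here is a minimal standalone sketch of the same pattern: lower server.time_until_store_dead, then poll crdb_internal.ranges_no_leases until no range lists n3 as a replica, giving up after the store-dead timeout plus an allocator allowance. The connection string, driver choice, and poll interval are assumptions for illustration and are not part of this commit.

```go
// Standalone sketch (not part of the commit) of the wait pattern introduced
// above. Assumes a locally reachable CockroachDB cluster.
package main

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"time"

	_ "github.com/lib/pq" // Postgres-wire driver; any compatible driver works.
)

const (
	dsn              = "postgresql://root@localhost:26257/defaultdb?sslmode=disable" // assumed DSN
	storeDeadTimeout = time.Minute
)

func main() {
	ctx := context.Background()
	db, err := sql.Open("postgres", dsn)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Mirror of the cluster setting change in the diff: shorten the time a
	// store must stay unavailable before it is considered dead.
	if _, err := db.ExecContext(ctx,
		"SET CLUSTER SETTING server.time_until_store_dead = $1", storeDeadTimeout.String(),
	); err != nil {
		log.Fatal(err)
	}

	// storeDeadTimeout until the node can be marked DEAD, plus one extra
	// minute for the allocator to actually move the replicas.
	deadline := time.Now().Add(storeDeadTimeout + time.Minute)
	for {
		var n int
		if err := db.QueryRowContext(ctx,
			"SELECT count(*) FROM crdb_internal.ranges_no_leases WHERE replicas @> ARRAY[3]",
		).Scan(&n); err != nil {
			log.Fatal(err)
		}
		fmt.Printf("found %d replicas on n3\n", n)
		if n == 0 {
			return
		}
		if time.Now().After(deadline) {
			log.Fatalf("%d ranges remained on n3 (according to meta2)", n)
		}
		time.Sleep(10 * time.Second) // fixed 10s poll, matching the test's backoff
	}
}
```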
