Skip to content

Commit

Permalink
multiregionccl: deflake the multi-region cold start test
Browse files Browse the repository at this point in the history
I ran the test 2500 times with no flakes. The core source of the flakes
is the test depends on every range having an available replica in every
region. Any amount of replica movement causes the test to fail because
many regions only have a single replica and temporary unavailability
causes the sql servers to pick a replica that is available, but
unreachable due to a networking hook injected by the test.

There are a few independent changes in this PR:

1. It includes the settings from #100320. The lease transfer setting
   improves test performance and disabling load based splitting helps
   avoid replica migration.
2. Based on the insight from #96334#issuecomment-1492610753 the test
   checks the span conformance report for all servers instead of only
   one of the servers.
3. The test no longer triggers scatters while waiting for the span
   config bounds to conform. The scatters increased test flakiness
   and are not required to achieve conformance as we proactively nudge
   replicas through the replicate queue after #100349.
4. WaitForFullReplication is called to ensure replication queues are
   empty and there are no pending range moves.

Fixes #96334

Release Note: None
  • Loading branch information
jeffswenson committed May 19, 2023
1 parent 56e15c3 commit 2cbd35e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 28 deletions.
1 change: 0 additions & 1 deletion pkg/ccl/multiregionccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ go_test(
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_google_grpc//:go_default_library",
"@org_golang_x_sync//errgroup",
],
)

Expand Down
46 changes: 19 additions & 27 deletions pkg/ccl/multiregionccl/cold_start_latency_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ import (
"github.com/jackc/pgx/v4"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
)

Expand All @@ -46,7 +45,6 @@ import (
func TestColdStartLatency(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.WithIssue(t, 96334)
skip.UnderRace(t, "too slow")
skip.UnderStress(t, "too slow")

Expand Down Expand Up @@ -147,6 +145,8 @@ func TestColdStartLatency(t *testing.T) {
// Shorten the closed timestamp target duration so that span configs
// propagate more rapidly.
tdb.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.target_duration = '200ms'`)
tdb.Exec(t, "SET CLUSTER SETTING kv.allocator.load_based_rebalancing = off")
tdb.Exec(t, "SET CLUSTER SETTING kv.allocator.min_lease_transfer_interval = '10ms'")
// Lengthen the lead time for the global tables to prevent overload from
// resulting in delays in propagating closed timestamps and, ultimately
// forcing requests from being redirected to the leaseholder. Without this
Expand Down Expand Up @@ -284,38 +284,30 @@ SELECT checkpoint > extract(epoch from after)
[][]string{{"true"}})
tenant.Stopper().Stop(ctx)
}

// Wait for the configs to be applied.
testutils.SucceedsWithin(t, func() error {
reporter := tc.Servers[0].Server.SpanConfigReporter()
report, err := reporter.SpanConfigConformance(ctx, []roachpb.Span{
{Key: keys.TableDataMin, EndKey: keys.TenantTableDataMax},
})
if err != nil {
return err
}
if !report.IsEmpty() {
var g errgroup.Group
for _, r := range report.ViolatingConstraints {
r := r // for closure
g.Go(func() error {
_, err := tc.Server(0).DB().AdminScatter(
ctx, r.RangeDescriptor.StartKey.AsRawKey(), 0,
)
return err
})
}
if err := g.Wait(); err != nil {
for _, server := range tc.Servers {
reporter := server.Server.SpanConfigReporter()
report, err := reporter.SpanConfigConformance(ctx, []roachpb.Span{
{Key: keys.TableDataMin, EndKey: keys.TenantTableDataMax},
})
if err != nil {
return err
}
return errors.Errorf("expected empty report, got: {over: %d, under: %d, violating: %d, unavailable: %d}",
len(report.OverReplicated),
len(report.UnderReplicated),
len(report.ViolatingConstraints),
len(report.Unavailable))
if !report.IsEmpty() {
return errors.Errorf("expected empty report, got: {over: %d, under: %d, violating: %d, unavailable: %d}",
len(report.OverReplicated),
len(report.UnderReplicated),
len(report.ViolatingConstraints),
len(report.Unavailable))
}
}
return nil
}, 5*time.Minute)

require.NoError(t, tc.WaitForFullReplication())

doTest := func(wg *sync.WaitGroup, qp *quotapool.IntPool, i int, duration *time.Duration) {
defer wg.Done()
r, _ := qp.Acquire(ctx, 1)
Expand All @@ -331,8 +323,8 @@ SELECT checkpoint > extract(epoch from after)
},
Locality: localities[i],
})
defer tenant.Stopper().Stop(ctx)
require.NoError(t, err)
defer tenant.Stopper().Stop(ctx)
pgURL, cleanup, err := sqlutils.PGUrlWithOptionalClientCertsE(
tenant.SQLAddr(), "tenantdata", url.UserPassword("foo", password),
false, // withClientCerts
Expand Down

0 comments on commit 2cbd35e

Please sign in to comment.