103666: multiregionccl: deflake the multi-region cold start test r=JeffSwenson a=JeffSwenson

I ran the test 2500 times with no flakes. The core source of the flakes
is that the test depends on every range having an available replica in
every region. Any amount of replica movement can cause the test to fail,
because many regions have only a single replica and temporary
unavailability causes the SQL servers to pick a replica that is
available, but unreachable due to a networking hook injected by the test.

There are a few independent changes in this PR:

1. It includes the settings from #100320. The lease transfer setting
   improves test performance and disabling load-based rebalancing helps
   avoid replica migration.
2. Based on the insight from #96334 (comment), the test now checks the
   span config conformance report for all servers instead of only one of
   them (see the sketch after this list).
3. The test no longer triggers scatters while waiting for the span
   config bounds to conform. The scatters increased test flakiness
   and are not required to achieve conformance as we proactively nudge
   replicas through the replicate queue after #100349.
4. WaitForFullReplication is called to ensure replication queues are
   empty and there are no pending range moves.
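
For reference, here is a condensed sketch of changes 2 and 4. It mirrors
the new code in `cold_start_latency_test.go` below; `tc`, `ctx`, and `t`
come from the surrounding test, and no API beyond what the diff already
uses is assumed:

```go
// Poll until every server's span config conformance report is clean,
// rather than trusting a single server's view of conformance.
testutils.SucceedsWithin(t, func() error {
	for _, server := range tc.Servers {
		reporter := server.Server.SpanConfigReporter()
		report, err := reporter.SpanConfigConformance(ctx, []roachpb.Span{
			{Key: keys.TableDataMin, EndKey: keys.TenantTableDataMax},
		})
		if err != nil {
			return err
		}
		if !report.IsEmpty() {
			return errors.Errorf("expected empty report, got: {over: %d, under: %d, violating: %d, unavailable: %d}",
				len(report.OverReplicated),
				len(report.UnderReplicated),
				len(report.ViolatingConstraints),
				len(report.Unavailable))
		}
	}
	return nil
}, 5*time.Minute)

// Then wait for the replicate queues to drain so no range moves are pending.
require.NoError(t, tc.WaitForFullReplication())
```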

Fixes #96334

Release Note: None

103680: roachtest: fix disk-stalled/* roachtests r=jbowens a=jbowens

In #103198, I mistakenly constructed a context with a timeout outside the deferral, so the deferred Unstall always timed out whenever the remainder of the test took longer than a minute.
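
The difference is when the one-minute timer starts ticking. A minimal before/after sketch, mirroring the `disk_stall.go` hunk below (`s` is the test's disk staller and `c` its cluster):

```go
// Before (#103198): the timeout context was built outside the deferral, so
// its one-minute timer started immediately and had usually expired by the
// time the deferred Unstall ran.
//
//	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
//	defer cancel()
//	defer s.Unstall(ctx, c.Node(1))

// After (this commit): the timeout context is created inside the deferred
// function, so Unstall always gets a fresh minute regardless of how long
// the rest of the test took.
defer func() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	s.Unstall(ctx, c.Node(1))
}()
```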

Epic: None
Fixes: #103664.
Fixes: #103663.
Fixes: #103662.
Fixes: #103661.
Release note: none

Co-authored-by: Jeff <[email protected]>
Co-authored-by: Jackson Owens <[email protected]>
3 people committed May 19, 2023
3 parents 2306edf + 2cbd35e + 4276890 commit a269433
Showing 3 changed files with 26 additions and 35 deletions.
1 change: 0 additions & 1 deletion pkg/ccl/multiregionccl/BUILD.bazel
@@ -108,7 +108,6 @@ go_test(
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_google_grpc//:go_default_library",
"@org_golang_x_sync//errgroup",
],
)

46 changes: 19 additions & 27 deletions pkg/ccl/multiregionccl/cold_start_latency_test.go
@@ -37,7 +37,6 @@ import (
"github.com/jackc/pgx/v4"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
)

@@ -46,7 +45,6 @@ import (
func TestColdStartLatency(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
-skip.WithIssue(t, 96334)
skip.UnderRace(t, "too slow")
skip.UnderStress(t, "too slow")

@@ -147,6 +145,8 @@ func TestColdStartLatency(t *testing.T) {
// Shorten the closed timestamp target duration so that span configs
// propagate more rapidly.
tdb.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.target_duration = '200ms'`)
tdb.Exec(t, "SET CLUSTER SETTING kv.allocator.load_based_rebalancing = off")
tdb.Exec(t, "SET CLUSTER SETTING kv.allocator.min_lease_transfer_interval = '10ms'")
// Lengthen the lead time for the global tables to prevent overload from
// resulting in delays in propagating closed timestamps and, ultimately
// forcing requests from being redirected to the leaseholder. Without this
@@ -284,38 +284,30 @@ SELECT checkpoint > extract(epoch from after)
[][]string{{"true"}})
tenant.Stopper().Stop(ctx)
}

// Wait for the configs to be applied.
testutils.SucceedsWithin(t, func() error {
-  reporter := tc.Servers[0].Server.SpanConfigReporter()
-  report, err := reporter.SpanConfigConformance(ctx, []roachpb.Span{
-    {Key: keys.TableDataMin, EndKey: keys.TenantTableDataMax},
-  })
-  if err != nil {
-    return err
-  }
-  if !report.IsEmpty() {
-    var g errgroup.Group
-    for _, r := range report.ViolatingConstraints {
-      r := r // for closure
-      g.Go(func() error {
-        _, err := tc.Server(0).DB().AdminScatter(
-          ctx, r.RangeDescriptor.StartKey.AsRawKey(), 0,
-        )
-        return err
-      })
-    }
-    if err := g.Wait(); err != nil {
+  for _, server := range tc.Servers {
+    reporter := server.Server.SpanConfigReporter()
+    report, err := reporter.SpanConfigConformance(ctx, []roachpb.Span{
+      {Key: keys.TableDataMin, EndKey: keys.TenantTableDataMax},
+    })
+    if err != nil {
      return err
    }
-    return errors.Errorf("expected empty report, got: {over: %d, under: %d, violating: %d, unavailable: %d}",
-      len(report.OverReplicated),
-      len(report.UnderReplicated),
-      len(report.ViolatingConstraints),
-      len(report.Unavailable))
+    if !report.IsEmpty() {
+      return errors.Errorf("expected empty report, got: {over: %d, under: %d, violating: %d, unavailable: %d}",
+        len(report.OverReplicated),
+        len(report.UnderReplicated),
+        len(report.ViolatingConstraints),
+        len(report.Unavailable))
+    }
  }
  return nil
}, 5*time.Minute)
+
+require.NoError(t, tc.WaitForFullReplication())
+
doTest := func(wg *sync.WaitGroup, qp *quotapool.IntPool, i int, duration *time.Duration) {
defer wg.Done()
r, _ := qp.Acquire(ctx, 1)
@@ -331,8 +323,8 @@ SELECT checkpoint > extract(epoch from after)
},
Locality: localities[i],
})
-defer tenant.Stopper().Stop(ctx)
require.NoError(t, err)
+defer tenant.Stopper().Stop(ctx)
pgURL, cleanup, err := sqlutils.PGUrlWithOptionalClientCertsE(
tenant.SQLAddr(), "tenantdata", url.UserPassword("foo", password),
false, // withClientCerts
14 changes: 7 additions & 7 deletions pkg/cmd/roachtest/tests/disk_stall.go
@@ -160,15 +160,15 @@ func runDiskStalledDetection(
m.ExpectDeath()
}
s.Stall(ctx, c.Node(1))
-{
-  // NB: We use a background context in the defer'ed unstall command,
-  // otherwise on test failure our c.Run calls will be ignored. Leaving
-  // the disk stalled will prevent artifact collection, making debugging
-  // difficult.
+// NB: We use a background context in the defer'ed unstall command,
+// otherwise on test failure our c.Run calls will be ignored. Leaving
+// the disk stalled will prevent artifact collection, making debugging
+// difficult.
+defer func() {
  ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
  defer cancel()
-  defer s.Unstall(ctx, c.Node(1))
-}
+  s.Unstall(ctx, c.Node(1))
+}()

// Wait twice the maximum sync duration and check if our SQL connection to
// node 1 is still alive. It should've been terminated.
