From a4ab9b6141e7b6ee8ee1d4b1ec63ee9bd13cfe2b Mon Sep 17 00:00:00 2001 From: Steven Danna Date: Tue, 15 Nov 2022 02:19:16 +0000 Subject: [PATCH] changfeedccl: deflake TestAlterChangefeedTelemetry The job system clears the lease asyncronously after cancelation. This lease clearing transaction can cause a restart in the alter changefeed transaction, which will lead to different feature counter counts. Thus, we want to wait for the lease clear. However, the lease clear isn't guaranteed to happen, so we only wait a few seconds for it. Release note: None --- pkg/ccl/changefeedccl/BUILD.bazel | 1 - .../changefeedccl/alter_changefeed_test.go | 28 +++++++++++++++++-- pkg/testutils/jobutils/jobs_verification.go | 13 --------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pkg/ccl/changefeedccl/BUILD.bazel b/pkg/ccl/changefeedccl/BUILD.bazel index ce5b5782aeb6..5cccc7d139bf 100644 --- a/pkg/ccl/changefeedccl/BUILD.bazel +++ b/pkg/ccl/changefeedccl/BUILD.bazel @@ -250,7 +250,6 @@ go_test( "//pkg/sql/types", "//pkg/storage", "//pkg/testutils", - "//pkg/testutils/jobutils", "//pkg/testutils/serverutils", "//pkg/testutils/skip", "//pkg/testutils/sqlutils", diff --git a/pkg/ccl/changefeedccl/alter_changefeed_test.go b/pkg/ccl/changefeedccl/alter_changefeed_test.go index 759df1a35439..367c0966f030 100644 --- a/pkg/ccl/changefeedccl/alter_changefeed_test.go +++ b/pkg/ccl/changefeedccl/alter_changefeed_test.go @@ -27,7 +27,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/execinfra" "github.com/cockroachdb/cockroach/pkg/sql/tests" "github.com/cockroachdb/cockroach/pkg/testutils" - "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" @@ -423,11 +422,34 @@ func TestAlterChangefeedTelemetry(t *testing.T) { testFeed := feed(t, f, `CREATE CHANGEFEED FOR foo, bar WITH diff`) defer closeFeed(t, testFeed) - feed := testFeed.(cdctest.EnterpriseTestFeed) require.NoError(t, feed.Pause()) - jobutils.WaitForJobToHaveNoLease(t, sqlDB, feed.JobID()) + + // The job system clears the lease asyncronously after + // cancellation. This lease clearing transaction can + // cause a restart in the alter changefeed + // transaction, which will lead to different feature + // counter counts. Thus, we want to wait for the lease + // clear. However, the lease clear isn't guaranteed to + // happen, so we only wait a few seconds for it. + waitForNoLease := func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + for { + if ctx.Err() != nil { + return + } + var sessionID []byte + sqlDB.QueryRow(t, `SELECT claim_session_id FROM system.jobs WHERE id = $1`, feed.JobID()).Scan(&sessionID) + if sessionID == nil { + return + } + time.Sleep(250 * time.Millisecond) + } + } + + waitForNoLease() sqlDB.Exec(t, fmt.Sprintf(`ALTER CHANGEFEED %d DROP bar, foo ADD baz UNSET diff SET resolved, format=json`, feed.JobID())) counts := telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ResetCounts) diff --git a/pkg/testutils/jobutils/jobs_verification.go b/pkg/testutils/jobutils/jobs_verification.go index 7e5d8c9f1644..1872596d41c7 100644 --- a/pkg/testutils/jobutils/jobs_verification.go +++ b/pkg/testutils/jobutils/jobs_verification.go @@ -89,19 +89,6 @@ func waitForJobToHaveStatus( }, 2*time.Minute) } -func WaitForJobToHaveNoLease(t testing.TB, db *sqlutils.SQLRunner, jobID jobspb.JobID) { - t.Helper() - testutils.SucceedsWithin(t, func() error { - var sessionID []byte - var instanceID gosql.NullInt64 - db.QueryRow(t, `SELECT claim_session_id, claim_instance_id FROM system.jobs WHERE id = $1`, jobID).Scan(&sessionID, &instanceID) - if sessionID == nil && !instanceID.Valid { - return nil - } - return errors.Newf("job %d still has claim information", jobID) - }, 2*time.Minute) -} - // RunJob runs the provided job control statement, initializing, notifying and // closing the chan at the passed pointer (see below for why) and returning the // jobID and error result. PAUSE JOB and CANCEL JOB are racy in that it's hard