Skip to content

Commit

Permalink
ttljob: fix minor book-keeping bug during txn retry
Browse files Browse the repository at this point in the history
Previously, we would double-count the number of deleted rows in the
face of a transaction retry.

Here, we move the counting outside of the Txn func and use a small
helper function to force transaction retries during the test.

It wasn't clear to me whether we wanted the metric to be the total
number of deletion attempts or the total committed deletions.  I've
made it the latter.

It seems likely that a transaction retry here is not very likely at
all, but this was found when trying to apply random transaction
retries to the entire test suite.

Informs #106417

Epic: none

Release note: none
  • Loading branch information
stevendanna committed Jul 21, 2023
1 parent acdb0c7 commit a885652
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 5 deletions.
1 change: 1 addition & 0 deletions pkg/sql/ttl/ttljob/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ go_test(
"//pkg/jobs/jobstest",
"//pkg/keys",
"//pkg/kv",
"//pkg/kv/kvserver",
"//pkg/roachpb",
"//pkg/scheduledjobs",
"//pkg/security/securityassets",
Expand Down
9 changes: 5 additions & 4 deletions pkg/sql/ttl/ttljob/ttljob_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,9 @@ func (t *ttlProcessor) runTTLOnQueryBounds(
until = numExpiredRows
}
deleteBatch := expiredRowsPKs[startRowIdx:until]
var batchRowCount int64
do := func(ctx context.Context, txn isql.Txn) error {
txn.KV().SetDebugName("ttljob-delete-batch")
// If we detected a schema change here, the DELETE will not succeed
// (the SELECT still will because of the AOST). Early exit here.
desc, err := flowCtx.Descriptors.ByIDWithLeased(txn.KV()).WithoutNonPublic().Get().Table(ctx, details.TableID)
Expand All @@ -334,21 +336,20 @@ func (t *ttlProcessor) runTTLOnQueryBounds(
defer tokens.Consume()

start := timeutil.Now()
batchRowCount, err := deleteBuilder.Run(ctx, txn, deleteBatch)
batchRowCount, err = deleteBuilder.Run(ctx, txn, deleteBatch)
if err != nil {
return err
}

metrics.DeleteDuration.RecordValue(int64(timeutil.Since(start)))
metrics.RowDeletions.Inc(batchRowCount)
spanRowCount += batchRowCount
return nil
}
if err := serverCfg.DB.Txn(
ctx, do, isql.SteppingEnabled(), isql.WithPriority(admissionpb.TTLLowPri),
); err != nil {
return spanRowCount, errors.Wrapf(err, "error during row deletion")
}
metrics.RowDeletions.Inc(batchRowCount)
spanRowCount += batchRowCount
}

// Step 3. Early exit if necessary.
Expand Down
7 changes: 6 additions & 1 deletion pkg/sql/ttl/ttljob/ttljob_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/jobs/jobstest"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/scheduledjobs"
"github.com/cockroachdb/cockroach/pkg/sql"
Expand Down Expand Up @@ -76,7 +77,11 @@ func newRowLevelTTLTestJobTestHelper(
),
}

requestFilter, _ := testutils.TestingRequestFilterRetryTxnWithPrefix(t, "ttljob-", 1)
baseTestingKnobs := base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
TestingRequestFilter: requestFilter,
},
JobsTestingKnobs: &jobs.TestingKnobs{
JobSchedulerEnv: th.env,
TakeOverJobsScheduling: func(fn func(ctx context.Context, maxSchedules int64) error) {
Expand Down Expand Up @@ -151,7 +156,7 @@ func (h *rowLevelTTLTestJobTestHelper) waitForScheduledJob(
require.NoError(t, h.executeSchedules())

query := fmt.Sprintf(
`SELECT status, error FROM [SHOW JOBS]
`SELECT status, error FROM [SHOW JOBS]
WHERE job_id IN (
SELECT id FROM %s
WHERE created_by_id IN (SELECT schedule_id FROM %s WHERE executor_type = 'scheduled-row-level-ttl-executor')
Expand Down

0 comments on commit a885652

Please sign in to comment.