Skip to content

Commit

Permalink
backuppcl: pass finishesSpec field to resume span
Browse files Browse the repository at this point in the history
PR #114268 broke the plumbing for the CompletedSpans metric which allows the
backup coordinator to update the FractionCompleted metric. This patch fixes
this bug by passing the finishesSpec field to the resume span.

Informs #120161

Release note: none
  • Loading branch information
msbutler committed Mar 13, 2024
1 parent c239d52 commit 6895727
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 8 deletions.
13 changes: 7 additions & 6 deletions pkg/ccl/backupccl/backup_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,12 +570,13 @@ func runBackupProcessor(
resumeTS = resp.Files[fileCount-1].EndKeyTS
}
resumeSpan = spanAndTime{
span: *resp.ResumeSpan,
firstKeyTS: resumeTS,
start: span.start,
end: span.end,
attempts: span.attempts,
lastTried: span.lastTried,
span: *resp.ResumeSpan,
firstKeyTS: resumeTS,
start: span.start,
end: span.end,
attempts: span.attempts,
lastTried: span.lastTried,
finishesSpec: span.finishesSpec,
}
}

Expand Down
12 changes: 10 additions & 2 deletions pkg/cmd/roachtest/tests/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,8 +408,16 @@ func registerBackup(r registry.Registry) {
// total elapsed time. This is used by roachperf to compute and display
// the average MB/sec per node.
tick()
c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
BACKUP bank.bank TO 'gs://cockroachdb-backup-testing/`+dest+`?AUTH=implicit'"`)
conn := c.Conn(ctx, t.L(), 1)
defer conn.Close()
var jobID jobspb.JobID
uri := `gs://cockroachdb-backup-testing/` + dest + `?AUTH=implicit`
if err := conn.QueryRowContext(ctx, fmt.Sprintf("BACKUP bank.bank INTO '%s' WITH detached", uri)).Scan(&jobID); err != nil {
return err
}
if err := AssertReasonableFractionCompleted(ctx, t.L(), c, jobID, 2); err != nil {
return err
}
tick()

// Upload the perf artifacts to any one of the nodes so that the test
Expand Down
56 changes: 56 additions & 0 deletions pkg/cmd/roachtest/tests/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
Expand Down Expand Up @@ -170,3 +171,58 @@ func getJobProgress(t test.Test, db *sqlutils.SQLRunner, jobID jobspb.JobID) *jo
}
return ret
}

func AssertReasonableFractionCompleted(
ctx context.Context, l *logger.Logger, c cluster.Cluster, jobID jobspb.JobID, nodeToQuery int,
) error {
ticker := time.NewTicker(15 * time.Second)
defer ticker.Stop()
fractionsRecorded := make([]float64, 0)

for {
select {
case <-ticker.C:
fractionCompleted, err := getFractionProgressed(ctx, l, c, jobID, nodeToQuery)
if err != nil {
return err
}
fractionsRecorded = append(fractionsRecorded, fractionCompleted)
if fractionCompleted == 1 {
count := len(fractionsRecorded)
if count > 5 && fractionsRecorded[count/2] < 0.2 && fractionsRecorded[count/2] > 0.8 {
return errors.Newf("the median fraction completed was %.2f, which is outside (0.2,0.8)", fractionsRecorded[count/2])
}
l.Printf("not enough 'fractionCompleted' recorded to assert progress looks sane")
return nil
}
case <-ctx.Done():
return errors.Wrap(ctx.Err(), "context canceled while waiting for job to finish")
}
}
}

func getFractionProgressed(
ctx context.Context, l *logger.Logger, c cluster.Cluster, jobID jobspb.JobID, nodeToQuery int,
) (float64, error) {
var status string
var fractionCompleted float64
conn := c.Conn(ctx, l, nodeToQuery)
defer conn.Close()
err := conn.QueryRowContext(ctx, `SELECT status, fraction_completed FROM [SHOW JOB $1]`, jobID).Scan(&status, &fractionCompleted)
if err != nil {
return 0, errors.Wrap(err, "getting the job status and fraction completed")
}
jobStatus := jobs.Status(status)
switch jobStatus {
case jobs.StatusSucceeded:
if fractionCompleted != 1 {
return 0, errors.Newf("job completed but fraction completed is %.2f", fractionCompleted)
}
return fractionCompleted, nil
case jobs.StatusRunning:
l.Printf("job %d still running, %.2f completed, waiting to succeed", jobID, fractionCompleted)
return fractionCompleted, nil
default:
return 0, errors.Newf("unexpectedly found job %s in state %s", jobID, status)
}
}

0 comments on commit 6895727

Please sign in to comment.