Skip to content

Commit

Permalink
Merge #65797
Browse files Browse the repository at this point in the history
65797: backupccl: fix bug in failing backups r=pbardea a=pbardea

Fixes a bug where backups progress updating goroutine may continue
running even after the backup command itself fails. At worst, this results
in a panic inside Google Cloud Storage's SDK since we attempt to write
to a destination that was previously closed upon the backup failure.

Release note (bug fix): fix a bug that could cause a node to crash in rare
cases if a BACKUP writing to Google Cloud Storage failed

Co-authored-by: Paul Bardea <[email protected]>
  • Loading branch information
craig[bot] and pbardea committed May 28, 2021
2 parents 7be33c1 + 60b88a8 commit 7e8a568
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 34 deletions.
30 changes: 15 additions & 15 deletions pkg/ccl/backupccl/backup_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ func backup(
spans := filterSpans(backupManifest.Spans, completedSpans)
introducedSpans := filterSpans(backupManifest.IntroducedSpans, completedIntroducedSpans)

g := ctxgroup.WithContext(ctx)
pkIDs := make(map[uint64]bool)
for i := range backupManifest.Descriptors {
if t, _, _, _ := descpb.FromDescriptor(&backupManifest.Descriptors[i]); t != nil {
Expand Down Expand Up @@ -227,17 +226,18 @@ func backup(
progressLogger := jobs.NewChunkProgressLogger(job, numTotalSpans, job.FractionCompleted(), jobs.ProgressUpdateOnly)

requestFinishedCh := make(chan struct{}, numTotalSpans) // enough buffer to never block
var jobProgressLoop func(ctx context.Context) error
if numTotalSpans > 0 {
g.GoCtx(func(ctx context.Context) error {
jobProgressLoop = func(ctx context.Context) error {
// Currently the granularity of backup progress is the % of spans
// exported. Would improve accuracy if we tracked the actual size of each
// file.
return progressLogger.Loop(ctx, requestFinishedCh)
})
}
}

progCh := make(chan *execinfrapb.RemoteProducerMetadata_BulkProcessorProgress)
g.GoCtx(func(ctx context.Context) error {
checkpointLoop := func(ctx context.Context) error {
// When a processor is done exporting a span, it will send a progress update
// to progCh.
for progress := range progCh {
Expand Down Expand Up @@ -267,20 +267,20 @@ func backup(
}
}
return nil
})
}

if err := distBackup(
ctx,
execCtx,
planCtx,
dsp,
progCh,
backupSpecs,
); err != nil {
return RowCount{}, err
runBackup := func(ctx context.Context) error {
return distBackup(
ctx,
execCtx,
planCtx,
dsp,
progCh,
backupSpecs,
)
}

if err := g.Wait(); err != nil {
if err := ctxgroup.GoAndWait(ctx, jobProgressLoop, checkpointLoop, runBackup); err != nil {
return RowCount{}, errors.Wrapf(err, "exporting %d ranges", errors.Safe(numTotalSpans))
}

Expand Down
35 changes: 16 additions & 19 deletions pkg/ccl/backupccl/restore_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -678,8 +678,6 @@ func restore(
}
})

g := ctxgroup.WithContext(restoreCtx)

// TODO(pbardea): This not super principled. I just wanted something that
// wasn't a constant and grew slower than linear with the length of
// importSpans. It seems to be working well for BenchmarkRestore2TB but
Expand All @@ -703,15 +701,15 @@ func restore(
}

requestFinishedCh := make(chan struct{}, len(importSpans)) // enough buffer to never block
g.GoCtx(func(ctx context.Context) error {
jobProgressLoop := func(ctx context.Context) error {
ctx, progressSpan := tracing.ChildSpan(ctx, "progress-log")
defer progressSpan.Finish()
return progressLogger.Loop(ctx, requestFinishedCh)
})
}

progCh := make(chan *execinfrapb.RemoteProducerMetadata_BulkProcessorProgress)

g.GoCtx(func(ctx context.Context) error {
jobCheckpointLoop := func(ctx context.Context) error {
// When a processor is done importing a span, it will send a progress update
// to progCh.
for progress := range progCh {
Expand Down Expand Up @@ -742,23 +740,22 @@ func restore(
requestFinishedCh <- struct{}{}
}
return nil
})
}

// TODO(pbardea): Improve logging in processors.
if err := distRestore(
restoreCtx,
execCtx,
importSpanChunks,
dataToRestore.getPKIDs(),
encryption,
dataToRestore.getRekeys(),
endTime,
progCh,
); err != nil {
return emptyRowCount, err
runRestore := func(ctx context.Context) error {
return distRestore(
ctx,
execCtx,
importSpanChunks,
dataToRestore.getPKIDs(),
encryption,
dataToRestore.getRekeys(),
endTime,
progCh,
)
}

if err := g.Wait(); err != nil {
if err := ctxgroup.GoAndWait(restoreCtx, jobProgressLoop, jobCheckpointLoop, runRestore); err != nil {
// This leaves the data that did get imported in case the user wants to
// retry.
// TODO(dan): Build tooling to allow a user to restart a failed restore.
Expand Down
13 changes: 13 additions & 0 deletions pkg/util/ctxgroup/ctxgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,16 @@ func GroupWorkers(ctx context.Context, num int, f func(context.Context, int) err
}
return group.Wait()
}

// GoAndWait calls the given functions each in a new goroutine. It then Waits
// for them to finish. This is intended to help prevent bugs caused by returning
// early after running some goroutines but before Waiting for them to complete.
func GoAndWait(ctx context.Context, fs ...func(ctx context.Context) error) error {
group := WithContext(ctx)
for _, f := range fs {
if f != nil {
group.GoCtx(f)
}
}
return group.Wait()
}

0 comments on commit 7e8a568

Please sign in to comment.