Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
60947: bulkio: make backup, restore, and import more resilient to node failures r=pbardea a=pbardea

This PR extracts error helpers that were specific to the changefeedccl package, so that they can
be used by all other jobs that run long-running DistSQL flows. This allows all jobs to be resilient to
node failures.

This commit does not fix that less-than-ideal error handling used to detect RPC failures from
DistSQL flow, but does aim to centralize the checks so that they occur only in 1 place.

Informs #50732.

Please see individual commits.

Release-justification: high impact fix

62452: ui: Replace "Admin UI" with "DB Console" on login r=nathanstilwell a=nathanstilwell

Changed a few remaining instances of "Admin UI" to "DB Console" that
were spotted on the login page.

Release note (ui change): Change copy that referred to the app as "Admin
UI" to "DB Console" instead

62870: Revert "sql: lease acquisition of OFFLINE descs may starve bulk operations" r=ajwerner a=fqazi

Fixes: #62864, #62853, #62849, #62844

Reverts offline descriptor leasing change to fix
intermittent failures introduced inside the
importccl tests.

Release note: None

Co-authored-by: Paul Bardea <[email protected]>
Co-authored-by: Nathan Stilwell <[email protected]>
Co-authored-by: Faizan Qazi <[email protected]>
  • Loading branch information
4 people committed Mar 31, 2021
4 parents a49cd0a + 7fd01c5 + fdc86d9 + 3e1c9f1 commit 46a8edd
Show file tree
Hide file tree
Showing 18 changed files with 607 additions and 242 deletions.
1 change: 1 addition & 0 deletions pkg/ccl/backupccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ go_library(
"//pkg/util/log",
"//pkg/util/metric",
"//pkg/util/protoutil",
"//pkg/util/retry",
"//pkg/util/syncutil",
"//pkg/util/timeutil",
"//pkg/util/tracing",
Expand Down
64 changes: 48 additions & 16 deletions pkg/ccl/backupccl/backup_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
Expand All @@ -38,6 +39,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/storage/cloudimpl"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -439,23 +441,53 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
}

statsCache := p.ExecCfg().TableStatsCache
res, err := backup(
ctx,
p,
details.URI,
details.URIsByLocalityKV,
p.ExecCfg().DB,
p.ExecCfg().Settings,
defaultStore,
storageByLocalityKV,
b.job,
backupManifest,
p.ExecCfg().DistSQLSrv.ExternalStorage,
details.EncryptionOptions,
statsCache,
)
// We retry on pretty generic failures -- any rpc error. If a worker node were
// to restart, it would produce this kind of error, but there may be other
// errors that are also rpc errors. Don't retry to aggressively.
retryOpts := retry.Options{
MaxBackoff: 1 * time.Second,
MaxRetries: 5,
}

// We want to retry a backup if there are transient failures (i.e. worker nodes
// dying), so if we receive a retryable error, re-plan and retry the backup.
var res RowCount
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
res, err = backup(
ctx,
p,
details.URI,
details.URIsByLocalityKV,
p.ExecCfg().DB,
p.ExecCfg().Settings,
defaultStore,
storageByLocalityKV,
b.job,
backupManifest,
p.ExecCfg().DistSQLSrv.ExternalStorage,
details.EncryptionOptions,
statsCache,
)
if err == nil {
break
}

if !utilccl.IsDistSQLRetryableError(err) {
return errors.Wrap(err, "failed to run backup")
}

log.Warningf(ctx, `BACKUP job encountered retryable error: %+v`, err)

// Reload the backup manifest to pick up any spans we may have completed on
// previous attempts.
var reloadBackupErr error
backupManifest, reloadBackupErr = b.readManifestOnResume(ctx, p.ExecCfg(), defaultStore, details)
if reloadBackupErr != nil {
log.Warning(ctx, "could not reload backup manifest when retrying, continuing with old progress")
}
}
if err != nil {
return errors.Wrap(err, "failed to run backup")
return errors.Wrap(err, "exhausted retries")
}

b.deleteCheckpoint(ctx, p.ExecCfg(), p.User())
Expand Down
57 changes: 55 additions & 2 deletions pkg/ccl/backupccl/restore_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ import (
"context"
"fmt"
"math"
"time"

"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/ccl/storageccl"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/keys"
Expand Down Expand Up @@ -48,6 +50,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/interval"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
Expand Down Expand Up @@ -500,6 +503,56 @@ func rewriteBackupSpanKey(
return newKey, nil
}

func restoreWithRetry(
restoreCtx context.Context,
execCtx sql.JobExecContext,
backupManifests []BackupManifest,
backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
endTime hlc.Timestamp,
dataToRestore restorationData,
job *jobs.Job,
encryption *jobspb.BackupEncryptionOptions,
) (RowCount, error) {
// We retry on pretty generic failures -- any rpc error. If a worker node were
// to restart, it would produce this kind of error, but there may be other
// errors that are also rpc errors. Don't retry to aggressively.
retryOpts := retry.Options{
MaxBackoff: 1 * time.Second,
MaxRetries: 5,
}

// We want to retry a restore if there are transient failures (i.e. worker nodes
// dying), so if we receive a retryable error, re-plan and retry the backup.
var res RowCount
var err error
for r := retry.StartWithCtx(restoreCtx, retryOpts); r.Next(); {
res, err = restore(
restoreCtx,
execCtx,
backupManifests,
backupLocalityInfo,
endTime,
dataToRestore,
job,
encryption,
)
if err == nil {
break
}

if !utilccl.IsDistSQLRetryableError(err) {
return RowCount{}, err
}

log.Warningf(restoreCtx, `encountered retryable error: %+v`, err)
}

if err != nil {
return RowCount{}, errors.Wrap(err, "exhausted retries")
}
return res, nil
}

// restore imports a SQL table (or tables) from sets of non-overlapping sstable
// files.
func restore(
Expand Down Expand Up @@ -1463,7 +1516,7 @@ func (r *restoreResumer) Resume(ctx context.Context, execCtx interface{}) error

var resTotal RowCount
if !preData.isEmpty() {
res, err := restore(
res, err := restoreWithRetry(
ctx,
p,
backupManifests,
Expand Down Expand Up @@ -1497,7 +1550,7 @@ func (r *restoreResumer) Resume(ctx context.Context, execCtx interface{}) error
{
// Restore the main data bundle. We notably only restore the system tables
// later.
res, err := restore(
res, err := restoreWithRetry(
ctx,
p,
backupManifests,
Expand Down
10 changes: 3 additions & 7 deletions pkg/ccl/changefeedccl/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"reflect"
"strings"

"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
"github.com/cockroachdb/errors"
)

Expand Down Expand Up @@ -60,13 +61,8 @@ func IsRetryableError(err error) bool {
// unfortunate string comparison.
return true
}
if strings.Contains(errStr, `rpc error`) {
// When a crdb node dies, any DistSQL flows with processors scheduled on
// it get an error with "rpc error" in the message from the call to
// `(*DistSQLPlanner).Run`.
return true
}
return false

return utilccl.IsDistSQLRetryableError(err)
}

// MaybeStripRetryableErrorMarker performs some minimal attempt to clean the
Expand Down
46 changes: 45 additions & 1 deletion pkg/ccl/importccl/import_stmt.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"sort"
"strconv"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/ccl/backupccl"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
Expand Down Expand Up @@ -1998,11 +1999,12 @@ func (r *importResumer) Resume(ctx context.Context, execCtx interface{}) error {
}
}

res, err := sql.DistIngest(ctx, p, r.job, tables, files, format, details.Walltime,
res, err := ingestWithRetry(ctx, p, r.job, tables, files, format, details.Walltime,
r.testingKnobs.alwaysFlushJobProgress)
if err != nil {
return err
}

pkIDs := make(map[uint64]struct{}, len(details.Tables))
for _, t := range details.Tables {
pkIDs[roachpb.BulkOpSummaryID(uint64(t.Desc.ID), uint64(t.Desc.PrimaryIndex.ID))] = struct{}{}
Expand Down Expand Up @@ -2060,6 +2062,48 @@ func (r *importResumer) Resume(ctx context.Context, execCtx interface{}) error {
return nil
}

func ingestWithRetry(
ctx context.Context,
execCtx sql.JobExecContext,
job *jobs.Job,
tables map[string]*execinfrapb.ReadImportDataSpec_ImportTable,
from []string,
format roachpb.IOFileFormat,
walltime int64,
alwaysFlushProgress bool,
) (roachpb.BulkOpSummary, error) {

// We retry on pretty generic failures -- any rpc error. If a worker node were
// to restart, it would produce this kind of error, but there may be other
// errors that are also rpc errors. Don't retry to aggressively.
retryOpts := retry.Options{
MaxBackoff: 1 * time.Second,
MaxRetries: 5,
}

// We want to retry a restore if there are transient failures (i.e. worker nodes
// dying), so if we receive a retryable error, re-plan and retry the backup.
var res roachpb.BulkOpSummary
var err error
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
res, err = sql.DistIngest(ctx, execCtx, job, tables, from, format, walltime, alwaysFlushProgress)
if err == nil {
break
}

if !utilccl.IsDistSQLRetryableError(err) {
return roachpb.BulkOpSummary{}, err
}

log.Warningf(ctx, `encountered retryable error: %+v`, err)
}

if err != nil {
return roachpb.BulkOpSummary{}, errors.Wrap(err, "exhausted retries")
}
return res, nil
}

func (r *importResumer) publishSchemas(ctx context.Context, execCfg *sql.ExecutorConfig) error {
details := r.job.Details().(jobspb.ImportDetails)
// Schemas should only be published once.
Expand Down
1 change: 1 addition & 0 deletions pkg/ccl/utilccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "utilccl",
srcs = [
"errors.go",
"jobutils.go",
"license_check.go",
],
Expand Down
30 changes: 30 additions & 0 deletions pkg/ccl/utilccl/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2021 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package utilccl

import "strings"

// IsDistSQLRetryableError returns true if the supplied error, or any of its parent
// causes is an rpc error.
// This is an unfortunate implementation that should be looking for a more
// specific error.
func IsDistSQLRetryableError(err error) bool {
if err == nil {
return false
}

// TODO(knz): this is a bad implementation. Make it go away
// by avoiding string comparisons.

errStr := err.Error()
// When a crdb node dies, any DistSQL flows with processors scheduled on
// it get an error with "rpc error" in the message from the call to
// `(*DistSQLPlanner).Run`.
return strings.Contains(errStr, `rpc error`)
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ go_library(
"inverted_index.go",
"java_helpers.go",
"jepsen.go",
"jobs.go",
"kv.go",
"kvbench.go",
"ledger.go",
Expand Down
Loading

0 comments on commit 46a8edd

Please sign in to comment.