Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
68995: sql: add columns in jobs virtual table for overview in DBConsole r=ajwerner a=sajjadrizvi

This commit adds new columns in `crdb_internal.jobs` table, which
show the current exponential-backoff state of a job and its execution
history.

Release justification: This commit adds low-risk updates to new
functionality. Jobs subsystem now supports job retries with
exponential-backoff. We want to give users more insights
about the backoff state of jobs and jobs' lifecycles through
additional columns in `crdb_internal.jobs` table.

Release note (general change): The functionality to retry failed
jobs with exponential-backoff has introduced recently in the system.
This commit adds new columns in `crdb_internal.jobs` table, which
show the current backoff-state of a job and its execution log. The
execution log consists of a sequence of job start and end events
and any associated errors that were encountered during the job's
each execution. Now users can query internal jobs table to get
more insights about jobs through the following columns: (a) `last_run`
shows the last execution time of a job, (b) `next_run` shows the
next execution time of a job based on exponential-backoff delay,
(c) `num_runs` shows the number of times the job has been executed,
and (d) `execution_log` provides a set of events that are generated
when a job starts and ends its execution.

Relates to #68179

69044: storageccl: remove non-ReturnSST ExportRequest r=dt a=dt

Release justification: bug fix in new functionality.

Release note: none.

69239: roachtest: move roachtest stress CI job instructions to README r=tbg,stevendanna a=erikgrinaker

Release justification: non-production code changes
Release note: None

69285: roachtest: increase consistency check timeout, and ignore errors r=tbg a=erikgrinaker

This bumps the consistency check timeout to 5 minutes. There are
indications that a recent libpq upgrade unmasked previously ignored
context cancellation errors, caused by the timeout here being too low.
It also ignores errors during the consistency check, since it is
best-effort anyway.

Resolves #68883.

Release justification: non-production code changes
Release note: None

Co-authored-by: Sajjad Rizvi <[email protected]>
Co-authored-by: David Taylor <[email protected]>
Co-authored-by: Erik Grinaker <[email protected]>
  • Loading branch information
4 people committed Aug 24, 2021
5 parents ec0fa4e + 3ccdba8 + 67ad54f + a452dbc + 6b89733 commit 7c36a9d
Show file tree
Hide file tree
Showing 20 changed files with 1,110 additions and 532 deletions.
1 change: 1 addition & 0 deletions pkg/ccl/backupccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ go_library(
"//pkg/sql/roleoption",
"//pkg/sql/rowenc",
"//pkg/sql/rowexec",
"//pkg/sql/sem/builtins",
"//pkg/sql/sem/tree",
"//pkg/sql/sessiondata",
"//pkg/sql/sqlerrors",
Expand Down
9 changes: 8 additions & 1 deletion pkg/ccl/backupccl/backup_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
Expand Down Expand Up @@ -631,7 +632,7 @@ func (s *sstSink) flushFile(ctx context.Context) error {
}

func (s *sstSink) open(ctx context.Context) error {
s.outName = storageccl.GenerateUniqueSSTName(s.conf.id)
s.outName = generateUniqueSSTName(s.conf.id)
if s.ctx == nil {
s.ctx, s.cancel = context.WithCancel(ctx)
}
Expand Down Expand Up @@ -740,6 +741,12 @@ func (s *sstSink) write(ctx context.Context, resp returnedSST) error {
return nil
}

func generateUniqueSSTName(nodeID base.SQLInstanceID) string {
// The data/ prefix, including a /, is intended to group SSTs in most of the
// common file/bucket browse UIs.
return fmt.Sprintf("data/%d.sst", builtins.GenerateUniqueInt(nodeID))
}

func init() {
rowexec.NewBackupDataProcessor = newBackupDataProcessor
}
2 changes: 0 additions & 2 deletions pkg/ccl/storageccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,9 @@ go_library(
"//pkg/roachpb:with-mocks",
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/sql/sem/builtins",
"//pkg/storage",
"//pkg/util/hlc",
"//pkg/util/humanizeutil",
"//pkg/util/log",
"//pkg/util/retry",
"//pkg/util/tracing",
"@com_github_cockroachdb_errors//:errors",
Expand Down
136 changes: 14 additions & 122 deletions pkg/ccl/storageccl/export.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,17 @@
package storageccl

import (
"bytes"
"context"
"fmt"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/cloud"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/errors"
"github.com/gogo/protobuf/types"
Expand Down Expand Up @@ -63,8 +57,6 @@ var ExportRequestMaxAllowedFileSizeOverage = settings.RegisterByteSizeSetting(
64<<20, /* 64 MiB */
).WithPublic()

const maxUploadRetries = 5

func init() {
batcheval.RegisterReadOnlyCommand(roachpb.Export, declareKeysExport, evalExport)
}
Expand Down Expand Up @@ -101,6 +93,14 @@ func evalExport(
}
evalExportSpan.RecordStructured(&evalExportTrace)

if !args.ReturnSST {
return result.Result{}, errors.New("ReturnSST is required")
}

if args.Encryption != nil {
return result.Result{}, errors.New("returned SSTs cannot be encrypted")
}

// For MVCC_All backups with no start time, they'll only be capturing the
// *revisions* since the gc threshold, so noting that in the reply allows the
// BACKUP to correctly note the supported time bounds for RESTORE AS OF SYSTEM
Expand All @@ -109,47 +109,6 @@ func evalExport(
reply.StartTime = cArgs.EvalCtx.GetGCThreshold()
}

makeExternalStorage := !args.ReturnSST || args.Storage != roachpb.ExternalStorage{} ||
(args.StorageByLocalityKV != nil && len(args.StorageByLocalityKV) > 0)
if makeExternalStorage || log.V(1) {
log.Infof(ctx, "export [%s,%s)", args.Key, args.EndKey)
} else {
// Requests that don't write to export storage are expected to be small.
log.Eventf(ctx, "export [%s,%s)", args.Key, args.EndKey)
}

if makeExternalStorage {
if _, ok := roachpb.TenantFromContext(ctx); ok {
if args.Storage.Provider == roachpb.ExternalStorageProvider_userfile {
return result.Result{}, errors.Errorf("requests to userfile on behalf of tenants must be made by the tenant's SQL process")
}
}
}

// To get the store to export to, first try to match the locality of this node
// to the locality KVs in args.StorageByLocalityKV (used for partitioned
// backups). If that map isn't set or there's no match, fall back to
// args.Storage.
var localityKV string
var exportStore cloud.ExternalStorage
if makeExternalStorage {
var storeConf roachpb.ExternalStorage
var err error
foundStoreByLocality := false
if args.StorageByLocalityKV != nil && len(args.StorageByLocalityKV) > 0 {
locality := cArgs.EvalCtx.GetNodeLocality()
localityKV, storeConf, foundStoreByLocality = getMatchingStore(&locality, args.StorageByLocalityKV)
}
if !foundStoreByLocality {
storeConf = args.Storage
}
exportStore, err = cArgs.EvalCtx.GetExternalStorage(ctx, storeConf)
if err != nil {
return result.Result{}, err
}
defer exportStore.Close()
}

var exportAllRevisions bool
switch args.MVCCFilter {
case roachpb.MVCCFilter_Latest:
Expand Down Expand Up @@ -181,11 +140,9 @@ func evalExport(
// Only use resume timestamp if splitting mid key is enabled.
resumeKeyTS := hlc.Timestamp{}
if args.SplitMidKey {
if !args.ReturnSST {
return result.Result{}, errors.New("SplitMidKey could only be used with ReturnSST option")
}
resumeKeyTS = args.ResumeKeyTS
}

var curSizeOfExportedSSTs int64
for start := args.Key; start != nil; {
destFile := &storage.MemFile{}
Expand Down Expand Up @@ -214,57 +171,16 @@ func evalExport(
span.EndKey = args.EndKey
}
exported := roachpb.ExportResponse_File{
Span: span,
EndKeyTS: resumeTS,
Exported: summary,
LocalityKV: localityKV,
}

returnSST := args.ReturnSST
if args.ReturnSstBelowSize > 0 && len(data) < int(args.ReturnSstBelowSize) {
returnSST = true
}

if returnSST {
exported.SST = data
} else {
if args.Encryption != nil {
data, err = EncryptFile(data, args.Encryption.Key)
if err != nil {
return result.Result{}, err
}
}

exported.Path = GenerateUniqueSSTName(base.SQLInstanceID(cArgs.EvalCtx.NodeID()))
var attemptNum int
if err := retry.WithMaxAttempts(ctx, base.DefaultRetryOptions(), maxUploadRetries, func() error {
attemptNum++
retryTracingEvent := roachpb.RetryTracingEvent{
Operation: fmt.Sprintf("%s.ExportRequest.WriteFile", exportStore.Conf().Provider.String()),
AttemptNumber: int32(attemptNum),
}
// We blindly retry any error here because we expect the caller to have
// verified the target is writable before sending ExportRequests for it.
if err := cloud.WriteFile(ctx, exportStore, exported.Path, bytes.NewReader(data)); err != nil {
log.VEventf(ctx, 1, "failed to put file: %+v", err)
retryTracingEvent.RetryError = fmt.Sprintf("failed to put file: %s", tracing.RedactAndTruncateError(err))
evalExportSpan.RecordStructured(&retryTracingEvent)
return err
}
evalExportSpan.RecordStructured(&retryTracingEvent)
return nil
}); err != nil {
return result.Result{}, err
}
Span: span,
EndKeyTS: resumeTS,
Exported: summary,
SST: data,
}
reply.Files = append(reply.Files, exported)
start = resume
resumeKeyTS = resumeTS

// If we are not returning the SSTs to the processor, there is no need to
// paginate the ExportRequest since the reply size will not grow large
// enough to cause an OOM.
if args.ReturnSST && h.TargetBytes > 0 {
if h.TargetBytes > 0 {
curSizeOfExportedSSTs += summary.DataSize
// There could be a situation where the size of exported SSTs is larger
// than the TargetBytes. In such a scenario, we want to report back
Expand Down Expand Up @@ -307,27 +223,3 @@ func evalExport(

return result.Result{}, nil
}

func getMatchingStore(
locality *roachpb.Locality, storageByLocalityKV map[string]*roachpb.ExternalStorage,
) (string, roachpb.ExternalStorage, bool) {
kvs := locality.Tiers
// When matching, more specific KVs in the node locality take precedence
// over less specific ones.
for i := len(kvs) - 1; i >= 0; i-- {
if store, ok := storageByLocalityKV[kvs[i].String()]; ok {
return kvs[i].String(), *store, true
}
}
return "", roachpb.ExternalStorage{}, false
}

// GenerateUniqueSSTName generates a name for a backup SST that will not collide
// with another name generated by this node or another node.
func GenerateUniqueSSTName(nodeID base.SQLInstanceID) string {
// The data/ prefix, including a /, is intended to group SSTs in most of the
// common file/bucket browse UIs.
// TODO(dt): don't reach out into a SQL builtin here; this code lives in KV.
// Create a unique int differently.
return fmt.Sprintf("data/%d.sst", builtins.GenerateUniqueInt(nodeID))
}
12 changes: 12 additions & 0 deletions pkg/cmd/roachtest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,22 @@ The HTTP endpoint (by default `:8080`) is useful to find the "run" numbers for
failing tests (to find the artifacts) and to get a general overview of the
progress of the invocation.

### Stressing a roachtest

A solid foundation for building the binaries and stressing a roachtest is
provided via the [roachstress.sh] script, which can either be used outright or
saved and adjusted. The script can be invoked without parameters from a clean
checkout of the cockroach repository at the revision to be tested. It will
prompt for user input on which test to stress.

[roachstress.sh]: https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachstress.sh

Another option is to start a [`Cockroach_Nightlies_RoachtestStress`](https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestStress)
CI job, which allows running a bunch of tests without having to keep your
laptop online. The CI job is run as follows:

1. Go to https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestStress
2. Click the ellipsis (...) next to the Run button and fill in:
* Changes → Build branch: `<branch>`
* Parameters → `env.TESTS`: `^<test>$`
* Parameters → `env.COUNT`: `<runs>`
12 changes: 7 additions & 5 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1266,7 +1266,7 @@ func (c *clusterImpl) CheckReplicaDivergenceOnDB(
//
// We've seen the consistency checks hang indefinitely in some cases.
rows, err := db.QueryContext(ctx, `
SET statement_timeout = '3m';
SET statement_timeout = '5m';
SELECT t.range_id, t.start_key_pretty, t.status, t.detail
FROM
crdb_internal.check_consistency(true, '', '') as t
Expand All @@ -1278,20 +1278,22 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
l.Printf("consistency check failed with %v; ignoring", err)
return nil
}
defer rows.Close()
var finalErr error
for rows.Next() {
var rangeID int32
var prettyKey, status, detail string
if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil {
return scanErr
l.Printf("consistency check failed with %v; ignoring", scanErr)
return nil
}
finalErr = errors.CombineErrors(finalErr,
errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail))
}
if err := rows.Err(); err != nil {
finalErr = errors.CombineErrors(finalErr, err)
l.Printf("consistency check failed with %v; ignoring", err)
return nil
}

return finalErr
}

Expand Down Expand Up @@ -1330,7 +1332,7 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t test.Test)
defer db.Close()

if err := contextutil.RunWithTimeout(
ctx, "consistency check", time.Minute,
ctx, "consistency check", 5*time.Minute,
func(ctx context.Context) error {
return c.CheckReplicaDivergenceOnDB(ctx, t.L(), db)
},
Expand Down
12 changes: 1 addition & 11 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -946,18 +946,8 @@ func (r *testRunner) maybePostGithubIssue(
ReproductionCommand: func(renderer *issues.Renderer) {
issues.ReproductionAsLink(
"roachtest README",
"https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/roachtest",
"https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md",
)(renderer)
issues.ReproductionAsLink(
"CI job to stress roachtests",
"https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestStress",
)(renderer)
renderer.P(func() {
renderer.Escaped("For the CI stress job, click the ellipsis (...) next to the Run button and fill in:\n")
renderer.Escaped(fmt.Sprintf("* Changes / Build branch: %s\n", branch))
renderer.Escaped(fmt.Sprintf("* Parameters / `env.TESTS`: `^%s$`\n", t.Name()))
renderer.Escaped("* Parameters / `env.COUNT`: <number of runs>\n")
})
},
}
if err := issues.Post(
Expand Down
Loading

0 comments on commit 7c36a9d

Please sign in to comment.