backupccl,kvserver: log failed ExportRequest trace on client and server
This change strives to improve observability around
backups that fail because of timed-out ExportRequests.
Currently, there is very little indication of what the request
was doing when the client cancelled the context after the
predetermined timeout window. With this change, we now log
the trace of the ExportRequest that failed. If the
ExportRequest was served locally, the trace will be part of
the sender's tracing span. However, if the request was served
on a remote node, the sender does not wait for the server-side
evaluation to notice the context cancellation. To work around
this, we also log the trace on the server side if the request
encountered a context cancellation and the associated tracing
span is not a noop.
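
In outline, the client-side pattern is the following (a
condensed sketch of the backup_processor.go change below;
`timeout`, `verboseTracing`, and `sendExport` stand in for the
cluster settings and the actual ExportRequest send):

    var recording tracingpb.Recording
    err := contextutil.RunWithTimeout(ctx, "ExportRequest", timeout,
        func(ctx context.Context) error {
            // Fork a child span so the request's trace can be recovered
            // even after the parent context is cancelled.
            sp := tracing.SpanFromContext(ctx)
            opts := []tracing.SpanOption{tracing.WithParent(sp)}
            if verboseTracing {
                opts = append(opts, tracing.WithRecording(tracingpb.RecordingVerbose))
            }
            ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "backupccl.ExportRequest", opts...)
            defer func() {
                recording = exportSpan.FinishAndGetConfiguredRecording()
            }()
            return sendExport(ctx)
        })
    if err != nil && recording != nil {
        log.Errorf(ctx, "failed export request trace:\n%s", recording)
    }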

This change also adds a private cluster setting
`bulkio.backup.export_request_verbose_tracing` that, if set to
true, sends every backup ExportRequest with verbose tracing
enabled.

To debug a backup failing with a timed-out ExportRequest, we
can now:
- SET CLUSTER SETTING bulkio.backup.export_request_verbose_tracing = true;
- SET CLUSTER SETTING trace.snapshot.rate = '1m';

Once the backup times out, we can inspect the server-side and
client-side ExportRequest traces in the logs to understand
where the request was stuck for so long.
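(The client-side lines are logged with the prefix
`failed export request` and the server-side ones with
`batch request ... failed with error`, so both are easy to
search for in the logs.)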

Fixes: cockroachdb#86047
Release note: None
adityamaru committed Jul 12, 2023
1 parent b89808d commit a60dd01
Showing 6 changed files with 66 additions and 8 deletions.
1 change: 1 addition & 0 deletions pkg/ccl/backupccl/BUILD.bazel
@@ -148,6 +148,7 @@ go_library(
"//pkg/util/syncutil",
"//pkg/util/timeutil",
"//pkg/util/tracing",
"//pkg/util/tracing/tracingpb",
"//pkg/util/uuid",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_logtags//:logtags",
31 changes: 29 additions & 2 deletions pkg/ccl/backupccl/backup_processor.go
@@ -30,6 +30,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/bulk"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
@@ -40,6 +41,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/redact"
@@ -55,7 +57,7 @@ var (
time.Minute,
settings.NonNegativeDuration,
).WithPublic()
- delayPerAttmpt = settings.RegisterDurationSetting(
+ delayPerAttempt = settings.RegisterDurationSetting(
settings.TenantWritable,
"bulkio.backup.read_retry_delay",
"amount of time since the read-as-of time, per-prior attempt, to wait before making another attempt",
@@ -82,6 +84,13 @@ var (
"split backup data on timestamps when writing revision history",
true,
)

sendExportRequestWithVerboseTracing = settings.RegisterBoolSetting(
settings.TenantWritable,
"bulkio.backup.export_request_verbose_tracing",
"send each export request with a verbose tracing span",
util.ConstantWithMetamorphicTestBool("export_request_verbose_tracing", false),
)
)

const (
@@ -378,7 +387,7 @@ func runBackupProcessor(
// We're okay with delaying this worker until then since we assume any
// other work it could pull off the queue will likely want to delay to
// a similar or later time anyway.
- if delay := delayPerAttmpt.Get(&clusterSettings.SV) - timeutil.Since(span.lastTried); delay > 0 {
+ if delay := delayPerAttempt.Get(&clusterSettings.SV) - timeutil.Since(span.lastTried); delay > 0 {
timer.Reset(delay)
log.Infof(ctx, "waiting %s to start attempt %d of remaining spans", delay, span.attempts+1)
select {
@@ -427,11 +436,22 @@
log.VEventf(ctx, 1, "sending ExportRequest for span %s (attempt %d, priority %s)",
span.span, span.attempts+1, header.UserPriority.String())
var rawResp kvpb.Response
var recording tracingpb.Recording
var pErr *kvpb.Error
requestSentAt := timeutil.Now()
exportRequestErr := contextutil.RunWithTimeout(ctx,
fmt.Sprintf("ExportRequest for span %s", span.span),
timeoutPerAttempt.Get(&clusterSettings.SV), func(ctx context.Context) error {
sp := tracing.SpanFromContext(ctx)
opts := make([]tracing.SpanOption, 0)
opts = append(opts, tracing.WithParent(sp))
if sendExportRequestWithVerboseTracing.Get(&clusterSettings.SV) {
opts = append(opts, tracing.WithRecording(tracingpb.RecordingVerbose))
}
ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "backupccl.ExportRequest", opts...)
defer func() {
recording = exportSpan.FinishAndGetConfiguredRecording()
}()
rawResp, pErr = kv.SendWrappedWithAdmission(
ctx, flowCtx.Cfg.DB.KV().NonTransactionalSender(), header, admissionHeader, req)
if pErr != nil {
@@ -453,6 +473,9 @@
// TimeoutError improves the opaque `context deadline exceeded` error
// message so use that instead.
if errors.HasType(exportRequestErr, (*contextutil.TimeoutError)(nil)) {
if recording != nil {
log.Errorf(ctx, "failed export request for span %s\n trace:\n%s", span.span, recording)
}
return errors.Wrap(exportRequestErr, "export request timeout")
}
// BatchTimestampBeforeGCError is returned if the ExportRequest
@@ -467,6 +490,10 @@
continue
}
}

if recording != nil {
log.Errorf(ctx, "failed export request %s\n trace:\n%s", span.span, recording)
}
return errors.Wrapf(exportRequestErr, "exporting %s", span.span)
}
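
The per-attempt timeout above comes from
`contextutil.RunWithTimeout`. For readers outside the repo, a
stdlib-only sketch of its shape (not the actual implementation,
which returns a structured `TimeoutError`):

    package main

    import (
        "context"
        "errors"
        "fmt"
        "time"
    )

    // runWithTimeout mimics the shape of contextutil.RunWithTimeout: run op
    // under a derived deadline and name the operation in the timeout error.
    func runWithTimeout(ctx context.Context, name string, timeout time.Duration,
        op func(context.Context) error) error {
        ctx, cancel := context.WithTimeout(ctx, timeout)
        defer cancel()
        err := op(ctx)
        if err != nil && errors.Is(err, context.DeadlineExceeded) {
            return fmt.Errorf("%s timed out after %s: %w", name, timeout, err)
        }
        return err
    }

    func main() {
        err := runWithTimeout(context.Background(), "ExportRequest", 10*time.Millisecond,
            func(ctx context.Context) error {
                <-ctx.Done() // simulate work that only ends when cancelled
                return ctx.Err()
            })
        fmt.Println(err) // ExportRequest timed out after 10ms: context deadline exceeded
    }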

9 changes: 9 additions & 0 deletions pkg/kv/kvserver/batcheval/cmd_export.go
@@ -233,6 +233,15 @@ func evalExport(
return result.Result{}, maybeAnnotateExceedMaxSizeError(err)
}
}

// Check if our ctx has been cancelled while we were constructing the SST.
// If it has been cancelled the client is no longer expecting a response.
select {
case <-ctx.Done():
return result.Result{}, ctx.Err()
default:
}

data := destFile.Bytes()

// NB: This should only happen in two cases:
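
The added early return is Go's standard non-blocking poll on
`ctx.Done()`. A self-contained sketch of the same shape, where
`expensiveWork` is a hypothetical stand-in for SST construction:

    // doWork computes its result, then refuses to return it if the caller's
    // context was cancelled in the meantime.
    func doWork(ctx context.Context, expensiveWork func() []byte) ([]byte, error) {
        data := expensiveWork()
        select {
        case <-ctx.Done():
            // The client has stopped waiting; surface its cancellation error.
            return nil, ctx.Err()
        default:
        }
        return data, nil
    }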
8 changes: 4 additions & 4 deletions pkg/kv/sender.go
@@ -441,10 +441,10 @@ func SendWrappedWith(
return SendWrappedWithAdmission(ctx, sender, h, kvpb.AdmissionHeader{}, args)
}

- // SendWrappedWithAdmission is a convenience function which wraps the request
- // in a batch and sends it via the provided Sender and headers. It returns the
- // unwrapped response or an error. It's valid to pass a `nil` context; an
- // empty one is used in that case.
+ // SendWrappedWithAdmission is a convenience function which wraps the request in
+ // a batch and sends it via the provided Sender and headers. It returns the
+ // unwrapped response or an error. It's valid to pass a `nil` context; an empty
+ // one is used in that case.
func SendWrappedWithAdmission(
ctx context.Context, sender Sender, h kvpb.Header, ah kvpb.AdmissionHeader, args kvpb.Request,
) (kvpb.Response, *kvpb.Error) {
11 changes: 11 additions & 0 deletions pkg/server/node.go
@@ -1219,6 +1219,17 @@ func (n *Node) batchInternal(
tracing.SpanFromContext(ctx).MaybeRecordStackHistory(tStart)
}

// If the sender cancelled the context they may not wait around for the
// replica to notice the cancellation and return a response. For this reason,
// we log the server-side trace of the cancelled request to help debug what
// the request was doing at the time it noticed the cancellation.
if pErr != nil && errors.IsAny(pErr.GoError(), context.Canceled, context.DeadlineExceeded) {
if sp := tracing.SpanFromContext(ctx); sp != nil && !sp.IsNoop() {
log.Infof(ctx, "batch request %s failed with error: %s\ntrace:\n%s", args.String(),
pErr.GoError().Error(), sp.GetConfiguredRecording().String())
}
}

n.metrics.callComplete(timeutil.Since(tStart), pErr)
br.Error = pErr

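
`errors.IsAny` from cockroachdb/errors behaves like `errors.Is`
against several reference errors at once and sees through
wrapping, which is what lets a wrapped cancellation still
trigger the logging above. A small sketch:

    // A wrapped context.Canceled still matches via IsAny.
    err := errors.Wrap(context.Canceled, "export request failed")
    matched := errors.IsAny(err, context.Canceled, context.DeadlineExceeded) // true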
14 changes: 12 additions & 2 deletions pkg/sql/sem/builtins/fingerprint_builtins.go
@@ -26,8 +26,10 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
"github.com/cockroachdb/errors"
)

@@ -134,19 +136,27 @@ func fingerprint(
ExportFingerprint: true,
FingerprintOptions: kvpb.FingerprintOptions{StripIndexPrefixAndTimestamp: stripped}}
var rawResp kvpb.Response
var recording tracingpb.Recording
var pErr *kvpb.Error
exportRequestErr := contextutil.RunWithTimeout(ctx,
fmt.Sprintf("ExportRequest fingerprint for span %s", roachpb.Span{Key: span.Key,
EndKey: span.EndKey}),
5*time.Minute, func(ctx context.Context) error {
- rawResp, pErr = kv.SendWrappedWithAdmission(ctx,
- evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req)
+ sp := tracing.SpanFromContext(ctx)
+ ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "fingerprint.ExportRequest", tracing.WithParent(sp))
+ defer func() {
+ recording = exportSpan.FinishAndGetConfiguredRecording()
+ }()
+ rawResp, pErr = kv.SendWrappedWithAdmission(ctx, evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req)
if pErr != nil {
return pErr.GoError()
}
return nil
})
if exportRequestErr != nil {
if recording != nil {
log.Errorf(ctx, "failed export request trace:\n%s", recording)
}
return nil, exportRequestErr
}

