Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backupccl,kvserver: log failed ExportRequest trace on client and server #102793

Merged
merged 1 commit into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/ccl/backupccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ go_library(
"//pkg/util/syncutil",
"//pkg/util/timeutil",
"//pkg/util/tracing",
"//pkg/util/tracing/tracingpb",
"//pkg/util/uuid",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_logtags//:logtags",
Expand Down
29 changes: 27 additions & 2 deletions pkg/ccl/backupccl/backup_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/bulk"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
Expand All @@ -40,6 +41,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/redact"
Expand All @@ -55,7 +57,7 @@ var (
time.Minute,
settings.NonNegativeDuration,
).WithPublic()
delayPerAttmpt = settings.RegisterDurationSetting(
delayPerAttempt = settings.RegisterDurationSetting(
settings.TenantWritable,
"bulkio.backup.read_retry_delay",
"amount of time since the read-as-of time, per-prior attempt, to wait before making another attempt",
Expand All @@ -82,6 +84,13 @@ var (
"split backup data on timestamps when writing revision history",
true,
)

sendExportRequestWithVerboseTracing = settings.RegisterBoolSetting(
settings.TenantWritable,
"bulkio.backup.export_request_verbose_tracing",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How come this name was validated by the test TestLintClusterSettingNames in pkg/sql/show_test.go? It's not valid per that linter. Ditto the _delay one above.

"send each export request with a verbose tracing span",
util.ConstantWithMetamorphicTestBool("export_request_verbose_tracing", false),
)
)

const (
Expand Down Expand Up @@ -378,7 +387,7 @@ func runBackupProcessor(
// We're okay with delaying this worker until then since we assume any
// other work it could pull off the queue will likely want to delay to
// a similar or later time anyway.
if delay := delayPerAttmpt.Get(&clusterSettings.SV) - timeutil.Since(span.lastTried); delay > 0 {
if delay := delayPerAttempt.Get(&clusterSettings.SV) - timeutil.Since(span.lastTried); delay > 0 {
timer.Reset(delay)
log.Infof(ctx, "waiting %s to start attempt %d of remaining spans", delay, span.attempts+1)
select {
Expand Down Expand Up @@ -427,13 +436,22 @@ func runBackupProcessor(
log.VEventf(ctx, 1, "sending ExportRequest for span %s (attempt %d, priority %s)",
span.span, span.attempts+1, header.UserPriority.String())
var rawResp kvpb.Response
var recording tracingpb.Recording
var pErr *kvpb.Error
requestSentAt := timeutil.Now()
exportRequestErr := contextutil.RunWithTimeout(ctx,
fmt.Sprintf("ExportRequest for span %s", span.span),
timeoutPerAttempt.Get(&clusterSettings.SV), func(ctx context.Context) error {
sp := tracing.SpanFromContext(ctx)
opts := make([]tracing.SpanOption, 0)
opts = append(opts, tracing.WithParent(sp))
if sendExportRequestWithVerboseTracing.Get(&clusterSettings.SV) {
opts = append(opts, tracing.WithRecording(tracingpb.RecordingVerbose))
}
ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "backupccl.ExportRequest", opts...)
rawResp, pErr = kv.SendWrappedWithAdmission(
ctx, flowCtx.Cfg.DB.KV().NonTransactionalSender(), header, admissionHeader, req)
recording = exportSpan.FinishAndGetConfiguredRecording()
if pErr != nil {
return pErr.GoError()
}
Expand All @@ -453,6 +471,9 @@ func runBackupProcessor(
// TimeoutError improves the opaque `context deadline exceeded` error
// message so use that instead.
if errors.HasType(exportRequestErr, (*contextutil.TimeoutError)(nil)) {
if recording != nil {
log.Errorf(ctx, "failed export request for span %s\n trace:\n%s", span.span, recording)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here and below: unnecessary space before trace.

}
return errors.Wrap(exportRequestErr, "export request timeout")
}
// BatchTimestampBeforeGCError is returned if the ExportRequest
Expand All @@ -467,6 +488,10 @@ func runBackupProcessor(
continue
}
}

if recording != nil {
log.Errorf(ctx, "failed export request %s\n trace:\n%s", span.span, recording)
}
return errors.Wrapf(exportRequestErr, "exporting %s", span.span)
}

Expand Down
9 changes: 9 additions & 0 deletions pkg/kv/kvserver/batcheval/cmd_export.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,15 @@ func evalExport(
return result.Result{}, maybeAnnotateExceedMaxSizeError(err)
}
}

// Check if our ctx has been cancelled while we were constructing the SST.
// If it has been cancelled the client is no longer expecting a response.
select {
case <-ctx.Done():
return result.Result{}, ctx.Err()
default:
}

data := destFile.Bytes()

// NB: This should only happen in two cases:
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,10 @@ func SendWrappedWith(
return SendWrappedWithAdmission(ctx, sender, h, kvpb.AdmissionHeader{}, args)
}

// SendWrappedWithAdmission is a convenience function which wraps the request
// in a batch and sends it via the provided Sender and headers. It returns the
// unwrapped response or an error. It's valid to pass a `nil` context; an
// empty one is used in that case.
// SendWrappedWithAdmission is a convenience function which wraps the request in
// a batch and sends it via the provided Sender and headers. It returns the
// unwrapped response or an error. It's valid to pass a `nil` context; an empty
// one is used in that case.
func SendWrappedWithAdmission(
ctx context.Context, sender Sender, h kvpb.Header, ah kvpb.AdmissionHeader, args kvpb.Request,
) (kvpb.Response, *kvpb.Error) {
Expand Down
11 changes: 11 additions & 0 deletions pkg/server/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -1218,6 +1218,17 @@ func (n *Node) batchInternal(
tracing.SpanFromContext(ctx).MaybeRecordStackHistory(tStart)
}

// If the sender cancelled the context they may not wait around for the
// replica to notice the cancellation and return a response. For this reason,
// we log the server-side trace of the cancelled request to help debug what
// the request was doing at the time it noticed the cancellation.
if pErr != nil && errors.IsAny(pErr.GoError(), context.Canceled, context.DeadlineExceeded) {
if sp := tracing.SpanFromContext(ctx); sp != nil && !sp.IsNoop() {
log.Infof(ctx, "batch request %s failed with error: %s\ntrace:\n%s", args.String(),
irfansharif marked this conversation as resolved.
Show resolved Hide resolved
pErr.GoError().Error(), sp.GetConfiguredRecording().String())
}
}

n.metrics.callComplete(timeutil.Since(tStart), pErr)
br.Error = pErr

Expand Down
12 changes: 10 additions & 2 deletions pkg/sql/sem/builtins/fingerprint_builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
"github.com/cockroachdb/errors"
)

Expand Down Expand Up @@ -90,19 +92,25 @@ func fingerprint(
ExportFingerprint: true,
FingerprintOptions: kvpb.FingerprintOptions{StripIndexPrefixAndTimestamp: stripped}}
var rawResp kvpb.Response
var recording tracingpb.Recording
var pErr *kvpb.Error
exportRequestErr := contextutil.RunWithTimeout(ctx,
fmt.Sprintf("ExportRequest fingerprint for span %s", roachpb.Span{Key: span.Key,
EndKey: span.EndKey}),
5*time.Minute, func(ctx context.Context) error {
rawResp, pErr = kv.SendWrappedWithAdmission(ctx,
evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req)
sp := tracing.SpanFromContext(ctx)
ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "fingerprint.ExportRequest", tracing.WithParent(sp))
rawResp, pErr = kv.SendWrappedWithAdmission(ctx, evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req)
recording = exportSpan.FinishAndGetConfiguredRecording()
if pErr != nil {
return pErr.GoError()
}
return nil
})
if exportRequestErr != nil {
if recording != nil {
log.Errorf(ctx, "failed export request trace:\n%s", recording)
}
return nil, exportRequestErr
}

Expand Down