diff --git a/pkg/ccl/backupccl/BUILD.bazel b/pkg/ccl/backupccl/BUILD.bazel index ccfbca9627fd..a8c350e5e683 100644 --- a/pkg/ccl/backupccl/BUILD.bazel +++ b/pkg/ccl/backupccl/BUILD.bazel @@ -148,6 +148,7 @@ go_library( "//pkg/util/syncutil", "//pkg/util/timeutil", "//pkg/util/tracing", + "//pkg/util/tracing/tracingpb", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_logtags//:logtags", diff --git a/pkg/ccl/backupccl/backup_processor.go b/pkg/ccl/backupccl/backup_processor.go index 35d4cdc5fc86..58e93695efc5 100644 --- a/pkg/ccl/backupccl/backup_processor.go +++ b/pkg/ccl/backupccl/backup_processor.go @@ -30,6 +30,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/rowexec" "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" "github.com/cockroachdb/cockroach/pkg/util/bulk" "github.com/cockroachdb/cockroach/pkg/util/contextutil" @@ -40,6 +41,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/stop" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" "github.com/cockroachdb/errors" "github.com/cockroachdb/logtags" "github.com/cockroachdb/redact" @@ -55,7 +57,7 @@ var ( time.Minute, settings.NonNegativeDuration, ).WithPublic() - delayPerAttmpt = settings.RegisterDurationSetting( + delayPerAttempt = settings.RegisterDurationSetting( settings.TenantWritable, "bulkio.backup.read_retry_delay", "amount of time since the read-as-of time, per-prior attempt, to wait before making another attempt", @@ -82,6 +84,13 @@ var ( "split backup data on timestamps when writing revision history", true, ) + + sendExportRequestWithVerboseTracing = settings.RegisterBoolSetting( + settings.TenantWritable, + "bulkio.backup.export_request_verbose_tracing", + "send each export request with a verbose tracing span", + util.ConstantWithMetamorphicTestBool("export_request_verbose_tracing", false), + ) ) const ( @@ -378,7 +387,7 @@ func runBackupProcessor( // We're okay with delaying this worker until then since we assume any // other work it could pull off the queue will likely want to delay to // a similar or later time anyway. - if delay := delayPerAttmpt.Get(&clusterSettings.SV) - timeutil.Since(span.lastTried); delay > 0 { + if delay := delayPerAttempt.Get(&clusterSettings.SV) - timeutil.Since(span.lastTried); delay > 0 { timer.Reset(delay) log.Infof(ctx, "waiting %s to start attempt %d of remaining spans", delay, span.attempts+1) select { @@ -427,11 +436,22 @@ func runBackupProcessor( log.VEventf(ctx, 1, "sending ExportRequest for span %s (attempt %d, priority %s)", span.span, span.attempts+1, header.UserPriority.String()) var rawResp kvpb.Response + var recording tracingpb.Recording var pErr *kvpb.Error requestSentAt := timeutil.Now() exportRequestErr := contextutil.RunWithTimeout(ctx, fmt.Sprintf("ExportRequest for span %s", span.span), timeoutPerAttempt.Get(&clusterSettings.SV), func(ctx context.Context) error { + sp := tracing.SpanFromContext(ctx) + opts := make([]tracing.SpanOption, 0) + opts = append(opts, tracing.WithParent(sp)) + if sendExportRequestWithVerboseTracing.Get(&clusterSettings.SV) { + opts = append(opts, tracing.WithRecording(tracingpb.RecordingVerbose)) + } + ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "backupccl.ExportRequest", opts...) + defer func() { + recording = exportSpan.FinishAndGetConfiguredRecording() + }() rawResp, pErr = kv.SendWrappedWithAdmission( ctx, flowCtx.Cfg.DB.KV().NonTransactionalSender(), header, admissionHeader, req) if pErr != nil { @@ -453,6 +473,9 @@ func runBackupProcessor( // TimeoutError improves the opaque `context deadline exceeded` error // message so use that instead. if errors.HasType(exportRequestErr, (*contextutil.TimeoutError)(nil)) { + if recording != nil { + log.Errorf(ctx, "failed export request for span %s\n trace:\n%s", span.span, recording) + } return errors.Wrap(exportRequestErr, "export request timeout") } // BatchTimestampBeforeGCError is returned if the ExportRequest @@ -467,6 +490,10 @@ func runBackupProcessor( continue } } + + if recording != nil { + log.Errorf(ctx, "failed export request %s\n trace:\n%s", span.span, recording) + } return errors.Wrapf(exportRequestErr, "exporting %s", span.span) } diff --git a/pkg/kv/kvserver/batcheval/cmd_export.go b/pkg/kv/kvserver/batcheval/cmd_export.go index a435e4e3d941..4f1becfa7515 100644 --- a/pkg/kv/kvserver/batcheval/cmd_export.go +++ b/pkg/kv/kvserver/batcheval/cmd_export.go @@ -233,6 +233,15 @@ func evalExport( return result.Result{}, maybeAnnotateExceedMaxSizeError(err) } } + + // Check if our ctx has been cancelled while we were constructing the SST. + // If it has been cancelled the client is no longer expecting a response. + select { + case <-ctx.Done(): + return result.Result{}, ctx.Err() + default: + } + data := destFile.Bytes() // NB: This should only happen in two cases: diff --git a/pkg/kv/sender.go b/pkg/kv/sender.go index 0d6a01871101..cd2614194d92 100644 --- a/pkg/kv/sender.go +++ b/pkg/kv/sender.go @@ -441,10 +441,10 @@ func SendWrappedWith( return SendWrappedWithAdmission(ctx, sender, h, kvpb.AdmissionHeader{}, args) } -// SendWrappedWithAdmission is a convenience function which wraps the request -// in a batch and sends it via the provided Sender and headers. It returns the -// unwrapped response or an error. It's valid to pass a `nil` context; an -// empty one is used in that case. +// SendWrappedWithAdmission is a convenience function which wraps the request in +// a batch and sends it via the provided Sender and headers. It returns the +// unwrapped response or an error. It's valid to pass a `nil` context; an empty +// one is used in that case. func SendWrappedWithAdmission( ctx context.Context, sender Sender, h kvpb.Header, ah kvpb.AdmissionHeader, args kvpb.Request, ) (kvpb.Response, *kvpb.Error) { diff --git a/pkg/server/node.go b/pkg/server/node.go index 6bdf3869847d..70a220081092 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -1219,6 +1219,17 @@ func (n *Node) batchInternal( tracing.SpanFromContext(ctx).MaybeRecordStackHistory(tStart) } + // If the sender cancelled the context they may not wait around for the + // replica to notice the cancellation and return a response. For this reason, + // we log the server-side trace of the cancelled request to help debug what + // the request was doing at the time it noticed the cancellation. + if pErr != nil && errors.IsAny(pErr.GoError(), context.Canceled, context.DeadlineExceeded) { + if sp := tracing.SpanFromContext(ctx); sp != nil && !sp.IsNoop() { + log.Infof(ctx, "batch request %s failed with error: %s\ntrace:\n%s", args.String(), + pErr.GoError().Error(), sp.GetConfiguredRecording().String()) + } + } + n.metrics.callComplete(timeutil.Since(tStart), pErr) br.Error = pErr diff --git a/pkg/sql/sem/builtins/fingerprint_builtins.go b/pkg/sql/sem/builtins/fingerprint_builtins.go index e6d6d6e2add0..baaf97e0bae7 100644 --- a/pkg/sql/sem/builtins/fingerprint_builtins.go +++ b/pkg/sql/sem/builtins/fingerprint_builtins.go @@ -26,8 +26,10 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" "github.com/cockroachdb/cockroach/pkg/util/contextutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" "github.com/cockroachdb/errors" ) @@ -134,19 +136,27 @@ func fingerprint( ExportFingerprint: true, FingerprintOptions: kvpb.FingerprintOptions{StripIndexPrefixAndTimestamp: stripped}} var rawResp kvpb.Response + var recording tracingpb.Recording var pErr *kvpb.Error exportRequestErr := contextutil.RunWithTimeout(ctx, fmt.Sprintf("ExportRequest fingerprint for span %s", roachpb.Span{Key: span.Key, EndKey: span.EndKey}), 5*time.Minute, func(ctx context.Context) error { - rawResp, pErr = kv.SendWrappedWithAdmission(ctx, - evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req) + sp := tracing.SpanFromContext(ctx) + ctx, exportSpan := sp.Tracer().StartSpanCtx(ctx, "fingerprint.ExportRequest", tracing.WithParent(sp)) + defer func() { + recording = exportSpan.FinishAndGetConfiguredRecording() + }() + rawResp, pErr = kv.SendWrappedWithAdmission(ctx, evalCtx.Txn.DB().NonTransactionalSender(), header, admissionHeader, req) if pErr != nil { return pErr.GoError() } return nil }) if exportRequestErr != nil { + if recording != nil { + log.Errorf(ctx, "failed export request trace:\n%s", recording) + } return nil, exportRequestErr }