From 7fa84318b3e90002ede9f9b3ec82bcc5198dbb2e Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Feb 2021 10:19:31 +0100 Subject: [PATCH] tracing: propagate non-recording spans across rpc boundaries We were previously propagating RPCs across boundaries only when they were verbose. We now propagate any span (except the noop span) regardless of current verbosity. This ensures that SQL (which always creates real, non-verbose spans by default) can trace the entirety of its operations. This enables the background collection of contention metadata in 21.1 and more generally, the idea of always-on tracing. Note that we had previously considered always-on tracing to mean avoiding the noop span altogether; due to memory retention issues[1][2] encountered we are taking a step back from that idea for the 21.1 release. Somehow creating additional trace spans seems to have fouled up some tracing-based tests. I can't boil that ocean here, so I filed a follow-up issue: #60672 [1]: https://github.com/cockroachdb/cockroach/pull/59370 [2]: https://github.com/cockroachdb/cockroach/pull/59431 Release note: None --- .../testdata/autocommit_nonmetamorphic | 28 ++++++++ pkg/util/tracing/crdbspan.go | 11 +--- pkg/util/tracing/grpc_interceptor.go | 2 +- pkg/util/tracing/span.go | 7 -- pkg/util/tracing/span_test.go | 1 - pkg/util/tracing/tracer.go | 16 ++--- pkg/util/tracing/tracer_test.go | 64 ++++++++++++++++--- 7 files changed, 93 insertions(+), 36 deletions(-) diff --git a/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic b/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic index 7191149f250a..0a016b360620 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic +++ b/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic @@ -110,6 +110,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 CPut, 1 EndTxn to (n1,s1):1 # TODO(radu): allow non-side-effecting projections. @@ -133,6 +134,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 CPut to (n1,s1):1 +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Insert with RETURNING statement with side-effects should not auto-commit. @@ -157,6 +159,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 CPut to (n1,s1):1 +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Another way to test the scenario above: generate an error and ensure that the @@ -192,6 +195,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 Put, 1 EndTxn to (n1,s1):1 # Multi-row upsert should auto-commit. @@ -213,6 +217,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 # No auto-commit inside a transaction. @@ -261,6 +266,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 # TODO(radu): allow non-side-effecting projections. @@ -284,6 +290,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Upsert with RETURNING statement with side-effects should not auto-commit. @@ -308,6 +315,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Another way to test the scenario above: generate an error and ensure that the @@ -343,6 +351,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 @@ -394,6 +403,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' ---- dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 # TODO(radu): allow non-side-effecting projections. @@ -418,6 +429,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' ---- dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Update with RETURNING statement with side-effects should not auto-commit. @@ -443,6 +456,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' ---- dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Another way to test the scenario above: generate an error and ensure that the @@ -478,6 +493,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 DelRng, 1 EndTxn to (n1,s1):1 # Multi-row delete should auto-commit. @@ -499,6 +516,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 DelRng, 1 EndTxn to (n1,s1):1 # No auto-commit inside a transaction. @@ -547,6 +566,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Del, 1 EndTxn to (n1,s1):1 @@ -570,6 +590,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Del to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -595,6 +616,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Del to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -644,6 +666,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 Scan to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -667,6 +690,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 1 Put to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 @@ -692,6 +716,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 1 Del to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 @@ -720,6 +745,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 1 DelRng to (n1,s1):1 +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 1 Del, 1 EndTxn to (n1,s1):1 @@ -749,6 +775,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -774,6 +801,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 diff --git a/pkg/util/tracing/crdbspan.go b/pkg/util/tracing/crdbspan.go index 9be388dc021b..7fcd5aa4cf19 100644 --- a/pkg/util/tracing/crdbspan.go +++ b/pkg/util/tracing/crdbspan.go @@ -137,16 +137,9 @@ func (s *crdbSpan) disableRecording() { func (s *crdbSpan) getRecording(m mode) Recording { if s == nil { - return nil - } - if m == modeLegacy && s.recordingType() == RecordingOff { - // In legacy tracing (pre always-on), we avoid allocations when the - // Span is not actively recording. - // - // TODO(tbg): we could consider doing the same when background tracing - // is on but the current span contains "nothing of interest". - return nil + return nil // noop span } + s.mu.Lock() // The capacity here is approximate since we don't know how many grandchildren // there are. diff --git a/pkg/util/tracing/grpc_interceptor.go b/pkg/util/tracing/grpc_interceptor.go index e25393b59e74..3164a12e0ff5 100644 --- a/pkg/util/tracing/grpc_interceptor.go +++ b/pkg/util/tracing/grpc_interceptor.go @@ -73,7 +73,7 @@ func spanInclusionFuncForServer(t *Tracer, spanMeta *SpanMeta) bool { // configured to always trace, return true. The second part is particularly // useful for calls coming through the HTTP->RPC gateway (i.e. the AdminUI), // where client is never tracing. - return !spanMeta.isNilOrNoop() || t.AlwaysTrace() + return spanMeta != nil || t.AlwaysTrace() } // setSpanTags sets one or more tags on the given span according to the diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index cc99d0b3e932..d46ddeec8f9b 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -171,13 +171,6 @@ func (s *Span) IsBlackHole() bool { return s.crdb.recordingType() == RecordingOff && s.netTr == nil && s.ot == (otSpan{}) } -// isNilOrNoop returns true if the Span context is either nil -// or corresponds to a "no-op" Span. If this is true, any Span -// derived from this context will be a "black hole Span". -func (sm *SpanMeta) isNilOrNoop() bool { - return sm == nil || (sm.recordingType == RecordingOff && sm.shadowTracerType == "") -} - // SpanStats are stats that can be added to a Span. type SpanStats interface { protoutil.Message diff --git a/pkg/util/tracing/span_test.go b/pkg/util/tracing/span_test.go index c0ab81007351..bd7fea5c03cb 100644 --- a/pkg/util/tracing/span_test.go +++ b/pkg/util/tracing/span_test.go @@ -205,7 +205,6 @@ Span child: func TestSpanRecordStructured(t *testing.T) { tr := NewTracer() - tr._mode = int32(modeBackground) sp := tr.StartSpan("root", WithForceRealSpan()) defer sp.Finish() diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index ed27ef33f416..77e37f9df311 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -292,14 +292,12 @@ func (t *Tracer) startSpanGeneric( opts.LogTags = logtags.FromContext(ctx) } - // Avoid creating a real span when possible. If tracing is globally - // enabled, we always need to create spans. If the incoming - // span is recording (which implies that there is a parent) then - // we also have to create a real child. Additionally, if the - // caller explicitly asked for a real span they need to get one. - // In all other cases, a noop span will do. + // Avoid creating a real span when possible. If tracing is globally enabled, + // we always need to create spans. If the incoming span has a parent, then we + // also create a real child. Additionally, if the caller explicitly asked for + // a real span they need to get one. In all other cases, a noop span will do. if !t.AlwaysTrace() && - opts.recordingType() == RecordingOff && + opts.parentTraceID() == 0 && !opts.ForceRealSpan { return maybeWrapCtx(ctx, nil /* octx */, t.noopSpan) } @@ -531,7 +529,7 @@ func (fn textMapWriterFn) Set(key, val string) { // Carrier. This, alongside ExtractMetaFrom, can be used to carry span metadata // across process boundaries. See serializationFormat for more details. func (t *Tracer) InjectMetaInto(sm *SpanMeta, carrier Carrier) error { - if sm.isNilOrNoop() { + if sm == nil { // Fast path when tracing is disabled. ExtractMetaFrom will accept an // empty map as a noop context. return nil @@ -574,7 +572,7 @@ func (t *Tracer) InjectMetaInto(sm *SpanMeta, carrier Carrier) error { return nil } -var noopSpanMeta = &SpanMeta{} +var noopSpanMeta = (*SpanMeta)(nil) // ExtractMetaFrom is used to deserialize a span metadata (if any) from the // given Carrier. This, alongside InjectMetaFrom, can be used to carry span diff --git a/pkg/util/tracing/tracer_test.go b/pkg/util/tracing/tracer_test.go index 8bc59728aeb2..c0e319c581f8 100644 --- a/pkg/util/tracing/tracer_test.go +++ b/pkg/util/tracing/tracer_test.go @@ -46,6 +46,9 @@ func TestTracerRecording(t *testing.T) { } noop1.Record("hello") + // Noop span returns empty recording. + require.Equal(t, Recording(nil), noop1.GetRecording()) + noop2 := tr.StartSpan("noop2", WithParentAndManualCollection(noop1.Meta())) if !noop2.isNoop() { t.Error("expected noop child Span") @@ -61,12 +64,20 @@ func TestTracerRecording(t *testing.T) { t.Error("WithForceRealSpan Span should be black hole") } - // Unless recording is actually started, child spans are still noop. - noop3 := tr.StartSpan("noop3", WithParentAndManualCollection(s1.Meta())) - if !noop3.isNoop() { - t.Error("expected noop child Span") + // Initial recording of this fresh (real) span. + if err := TestingCheckRecordedSpans(s1.GetRecording(), ` + Span a: + tags: _unfinished=1 + `); err != nil { + t.Fatal(err) } - noop3.Finish() + + // Real parent --> real child. + real3 := tr.StartSpan("noop3", WithParentAndManualCollection(s1.Meta())) + if real3.isNoop() { + t.Error("expected real child Span") + } + real3.Finish() s1.Recordf("x=%d", 1) s1.SetVerbose(true) @@ -116,7 +127,7 @@ func TestTracerRecording(t *testing.T) { t.Fatal(err) } s3.Finish() - if err := TestingCheckRecordedSpans(s1.GetRecording(), ` + expS1v := ` Span a: tags: _unfinished=1 _verbose=1 event: x=2 @@ -126,12 +137,26 @@ func TestTracerRecording(t *testing.T) { Span c: tags: _verbose=1 tag=val event: x=4 - `); err != nil { + ` + if err := TestingCheckRecordedSpans(s1.GetRecording(), expS1v); err != nil { t.Fatal(err) } + // When we turn off verbosity and add more verbose logs, they get + // dropped, i.e. the recording is unchanged mod the _verbose flag. + expS1n := ` + Span a: + tags: _unfinished=1 + event: x=2 + Span b: + tags: _verbose=1 + event: x=3 + Span c: + tags: _verbose=1 tag=val + event: x=4 + ` s1.SetVerbose(false) s1.Recordf("x=%d", 100) - if err := TestingCheckRecordedSpans(s1.GetRecording(), ``); err != nil { + if err := TestingCheckRecordedSpans(s1.GetRecording(), expS1n); err != nil { t.Fatal(err) } @@ -223,7 +248,7 @@ func TestTracerInjectExtract(t *testing.T) { if err != nil { t.Fatal(err) } - if !wireSpanMeta.isNilOrNoop() { + if wireSpanMeta != nil { t.Errorf("expected noop context: %v", wireSpanMeta) } noop2 := tr2.StartSpan("remote op", WithParentAndManualCollection(wireSpanMeta)) @@ -292,6 +317,27 @@ func TestTracerInjectExtract(t *testing.T) { } } +func TestTracer_PropagateNonRecordingRealSpanAcrossRPCBoundaries(t *testing.T) { + // Verify that when a span is put on the wire on one end, and is checked + // against the span inclusion functions both on the client and server, a real + // span results in a real span. + tr1 := NewTracer() + sp1 := tr1.StartSpan("tr1.root", WithForceRealSpan()) + defer sp1.Finish() + carrier := metadataCarrier{MD: metadata.MD{}} + require.True(t, spanInclusionFuncForClient(sp1)) + require.NoError(t, tr1.InjectMetaInto(sp1.Meta(), carrier)) + require.Equal(t, 2, carrier.Len(), "%+v", carrier) // trace id and span id + + tr2 := NewTracer() + meta, err := tr2.ExtractMetaFrom(carrier) + require.NoError(t, err) + require.True(t, spanInclusionFuncForServer(tr2, meta)) + sp2 := tr2.StartSpan("tr2.child", WithParentAndManualCollection(meta)) + defer sp2.Finish() + require.NotZero(t, sp2.crdb.spanID) +} + func TestLightstepContext(t *testing.T) { tr := NewTracer() lsTr := lightstep.NewTracer(lightstep.Options{