Skip to content

Commit

Permalink
tracing: propagate non-recording spans across rpc boundaries
Browse files Browse the repository at this point in the history
We were previously propagating RPCs across boundaries only when they
were verbose. We now propagate any span (except the noop span)
regardless of current verbosity.
This ensures that SQL (which always creates real, non-verbose spans by
default) can trace the entirety of its operations.

This enables the background collection of contention metadata in 21.1
and more generally, the idea of always-on tracing. Note that we had
previously considered always-on tracing to mean avoiding the noop span
altogether; due to memory retention issues[1][2] encountered we are
taking a step back from that idea for the 21.1 release.

Somehow creating additional trace spans seems to have fouled up some
tracing-based tests. I can't boil that ocean here, so I filed a
follow-up issue: cockroachdb#60672

[1]: cockroachdb#59370
[2]: cockroachdb#59431

Release note: None
  • Loading branch information
tbg committed Feb 17, 2021
1 parent 34d19cb commit cd7f204
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 36 deletions.
28 changes: 28 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 CPut, 1 EndTxn to (n1,s1):1

# TODO(radu): allow non-side-effecting projections.
Expand All @@ -133,6 +134,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1

# Insert with RETURNING statement with side-effects should not auto-commit.
Expand All @@ -157,6 +159,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1

# Another way to test the scenario above: generate an error and ensure that the
Expand Down Expand Up @@ -192,6 +195,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 1 Put, 1 EndTxn to (n1,s1):1

# Multi-row upsert should auto-commit.
Expand All @@ -213,6 +217,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1

# No auto-commit inside a transaction.
Expand Down Expand Up @@ -261,6 +266,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1

# TODO(radu): allow non-side-effecting projections.
Expand All @@ -284,6 +290,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1

# Upsert with RETURNING statement with side-effects should not auto-commit.
Expand All @@ -308,6 +315,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1

# Another way to test the scenario above: generate an error and ensure that the
Expand Down Expand Up @@ -343,6 +351,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1

Expand Down Expand Up @@ -394,6 +403,8 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1

# TODO(radu): allow non-side-effecting projections.
Expand All @@ -418,6 +429,8 @@ WHERE message LIKE '%r$rangeid: sending batch%'
----
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1

# Update with RETURNING statement with side-effects should not auto-commit.
Expand All @@ -443,6 +456,8 @@ WHERE message LIKE '%r$rangeid: sending batch%'
----
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1

# Another way to test the scenario above: generate an error and ensure that the
Expand Down Expand Up @@ -478,6 +493,8 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 DelRng, 1 EndTxn to (n1,s1):1

# Multi-row delete should auto-commit.
Expand All @@ -499,6 +516,8 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Put to (n1,s1):1
dist sender send r36: sending batch 1 DelRng, 1 EndTxn to (n1,s1):1

# No auto-commit inside a transaction.
Expand Down Expand Up @@ -547,6 +566,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%PushTxn%'
AND message NOT LIKE '%QueryTxn%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Del, 1 EndTxn to (n1,s1):1

Expand All @@ -570,6 +590,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Del to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1
Expand All @@ -595,6 +616,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 2 Del to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1
Expand Down Expand Up @@ -644,6 +666,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 Scan to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1
Expand All @@ -667,6 +690,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 1 Put to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
Expand All @@ -692,6 +716,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 1 Del to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
Expand Down Expand Up @@ -720,6 +745,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 1 Scan to (n1,s1):1
dist sender send r36: sending batch 1 Del, 1 EndTxn to (n1,s1):1

Expand Down Expand Up @@ -749,6 +775,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1
Expand All @@ -774,6 +801,7 @@ WHERE message LIKE '%r$rangeid: sending batch%'
AND message NOT LIKE '%QueryTxn%'
AND operation NOT LIKE '%async%'
----
dist sender send r36: sending batch 1 DelRng to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 2 CPut to (n1,s1):1
dist sender send r36: sending batch 1 EndTxn to (n1,s1):1
11 changes: 2 additions & 9 deletions pkg/util/tracing/crdbspan.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,9 @@ func (s *crdbSpan) disableRecording() {

func (s *crdbSpan) getRecording(m mode) Recording {
if s == nil {
return nil
}
if m == modeLegacy && s.recordingType() == RecordingOff {
// In legacy tracing (pre always-on), we avoid allocations when the
// Span is not actively recording.
//
// TODO(tbg): we could consider doing the same when background tracing
// is on but the current span contains "nothing of interest".
return nil
return nil // noop span
}

s.mu.Lock()
// The capacity here is approximate since we don't know how many grandchildren
// there are.
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/tracing/grpc_interceptor.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func spanInclusionFuncForServer(t *Tracer, spanMeta *SpanMeta) bool {
// configured to always trace, return true. The second part is particularly
// useful for calls coming through the HTTP->RPC gateway (i.e. the AdminUI),
// where client is never tracing.
return !spanMeta.isNilOrNoop() || t.AlwaysTrace()
return spanMeta != nil || t.AlwaysTrace()
}

// setSpanTags sets one or more tags on the given span according to the
Expand Down
7 changes: 0 additions & 7 deletions pkg/util/tracing/span.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,6 @@ func (s *Span) IsBlackHole() bool {
return s.crdb.recordingType() == RecordingOff && s.netTr == nil && s.ot == (otSpan{})
}

// isNilOrNoop returns true if the Span context is either nil
// or corresponds to a "no-op" Span. If this is true, any Span
// derived from this context will be a "black hole Span".
func (sm *SpanMeta) isNilOrNoop() bool {
return sm == nil || (sm.recordingType == RecordingOff && sm.shadowTracerType == "")
}

// SpanStats are stats that can be added to a Span.
type SpanStats interface {
protoutil.Message
Expand Down
1 change: 0 additions & 1 deletion pkg/util/tracing/span_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,6 @@ Span child:

func TestSpanRecordStructured(t *testing.T) {
tr := NewTracer()
tr._mode = int32(modeBackground)
sp := tr.StartSpan("root", WithForceRealSpan())
defer sp.Finish()

Expand Down
16 changes: 7 additions & 9 deletions pkg/util/tracing/tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,14 +293,12 @@ func (t *Tracer) startSpanGeneric(
opts.LogTags = logtags.FromContext(ctx)
}

// Avoid creating a real span when possible. If tracing is globally
// enabled, we always need to create spans. If the incoming
// span is recording (which implies that there is a parent) then
// we also have to create a real child. Additionally, if the
// caller explicitly asked for a real span they need to get one.
// In all other cases, a noop span will do.
// Avoid creating a real span when possible. If tracing is globally enabled,
// we always need to create spans. If the incoming span has a parent, then we
// also create a real child. Additionally, if the caller explicitly asked for
// a real span they need to get one. In all other cases, a noop span will do.
if !t.AlwaysTrace() &&
opts.recordingType() == RecordingOff &&
opts.parentTraceID() == 0 &&
!opts.ForceRealSpan {
return maybeWrapCtx(ctx, nil /* octx */, t.noopSpan)
}
Expand Down Expand Up @@ -532,7 +530,7 @@ func (fn textMapWriterFn) Set(key, val string) {
// Carrier. This, alongside ExtractMetaFrom, can be used to carry span metadata
// across process boundaries. See serializationFormat for more details.
func (t *Tracer) InjectMetaInto(sm *SpanMeta, carrier Carrier) error {
if sm.isNilOrNoop() {
if sm == nil {
// Fast path when tracing is disabled. ExtractMetaFrom will accept an
// empty map as a noop context.
return nil
Expand Down Expand Up @@ -575,7 +573,7 @@ func (t *Tracer) InjectMetaInto(sm *SpanMeta, carrier Carrier) error {
return nil
}

var noopSpanMeta = &SpanMeta{}
var noopSpanMeta = (*SpanMeta)(nil)

// ExtractMetaFrom is used to deserialize a span metadata (if any) from the
// given Carrier. This, alongside InjectMetaFrom, can be used to carry span
Expand Down
64 changes: 55 additions & 9 deletions pkg/util/tracing/tracer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ func TestTracerRecording(t *testing.T) {
}
noop1.Record("hello")

// Noop span returns empty recording.
require.Equal(t, Recording(nil), noop1.GetRecording())

noop2 := tr.StartSpan("noop2", WithParentAndManualCollection(noop1.Meta()))
if !noop2.isNoop() {
t.Error("expected noop child Span")
Expand All @@ -61,12 +64,20 @@ func TestTracerRecording(t *testing.T) {
t.Error("WithForceRealSpan Span should be black hole")
}

// Unless recording is actually started, child spans are still noop.
noop3 := tr.StartSpan("noop3", WithParentAndManualCollection(s1.Meta()))
if !noop3.isNoop() {
t.Error("expected noop child Span")
// Initial recording of this fresh (real) span.
if err := TestingCheckRecordedSpans(s1.GetRecording(), `
Span a:
tags: _unfinished=1
`); err != nil {
t.Fatal(err)
}
noop3.Finish()

// Real parent --> real child.
real3 := tr.StartSpan("noop3", WithParentAndManualCollection(s1.Meta()))
if real3.isNoop() {
t.Error("expected real child Span")
}
real3.Finish()

s1.Recordf("x=%d", 1)
s1.SetVerbose(true)
Expand Down Expand Up @@ -116,7 +127,7 @@ func TestTracerRecording(t *testing.T) {
t.Fatal(err)
}
s3.Finish()
if err := TestingCheckRecordedSpans(s1.GetRecording(), `
expS1v := `
Span a:
tags: _unfinished=1 _verbose=1
event: x=2
Expand All @@ -126,12 +137,26 @@ func TestTracerRecording(t *testing.T) {
Span c:
tags: _verbose=1 tag=val
event: x=4
`); err != nil {
`
if err := TestingCheckRecordedSpans(s1.GetRecording(), expS1v); err != nil {
t.Fatal(err)
}
// When we turn off verbosity and add more verbose logs, they get
// dropped, i.e. the recording is unchanged mod the _verbose flag.
expS1n := `
Span a:
tags: _unfinished=1
event: x=2
Span b:
tags: _verbose=1
event: x=3
Span c:
tags: _verbose=1 tag=val
event: x=4
`
s1.SetVerbose(false)
s1.Recordf("x=%d", 100)
if err := TestingCheckRecordedSpans(s1.GetRecording(), ``); err != nil {
if err := TestingCheckRecordedSpans(s1.GetRecording(), expS1n); err != nil {
t.Fatal(err)
}

Expand Down Expand Up @@ -223,7 +248,7 @@ func TestTracerInjectExtract(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if !wireSpanMeta.isNilOrNoop() {
if wireSpanMeta != nil {
t.Errorf("expected noop context: %v", wireSpanMeta)
}
noop2 := tr2.StartSpan("remote op", WithParentAndManualCollection(wireSpanMeta))
Expand Down Expand Up @@ -292,6 +317,27 @@ func TestTracerInjectExtract(t *testing.T) {
}
}

func TestTracer_PropagateNonRecordingRealSpanAcrossRPCBoundaries(t *testing.T) {
// Verify that when a span is put on the wire on one end, and is checked
// against the span inclusion functions both on the client and server, a real
// span results in a real span.
tr1 := NewTracer()
sp1 := tr1.StartSpan("tr1.root", WithForceRealSpan())
defer sp1.Finish()
carrier := metadataCarrier{MD: metadata.MD{}}
require.True(t, spanInclusionFuncForClient(sp1))
require.NoError(t, tr1.InjectMetaInto(sp1.Meta(), carrier))
require.Equal(t, 2, carrier.Len(), "%+v", carrier) // trace id and span id

tr2 := NewTracer()
meta, err := tr2.ExtractMetaFrom(carrier)
require.NoError(t, err)
require.True(t, spanInclusionFuncForServer(tr2, meta))
sp2 := tr2.StartSpan("tr2.child", WithParentAndManualCollection(meta))
defer sp2.Finish()
require.NotZero(t, sp2.crdb.spanID)
}

func TestLightstepContext(t *testing.T) {
tr := NewTracer()
lsTr := lightstep.NewTracer(lightstep.Options{
Expand Down

0 comments on commit cd7f204

Please sign in to comment.