Skip to content

Commit

Permalink
tracing: enable always-on tracing by default
Browse files Browse the repository at this point in the history
This is follow-up work from #58712, where we measured the overhead for
always-on tracing and found it to be minimal/acceptable. Lets switch
this on by default to shake the implications of doing so. We can
reasonably expect two kinds of fallout:

(1) Unexpected blow up in memory usage due to resource leakage (which is
a can be problem now that we're always maintaining open spans in an
internal registry, see #58721)

(2) Performance degradataion due to tracing overhead per-request
(something #58712) was spot checking for.

For (1) we'll introduce a future test in a separate PR. For (2), we'll
monitor roachperf over the next few weeks.

---

Also moved some of the documentation for the cluster setting into a
comment form above. Looking at what's rendered in our other cluster
settings (`SHOW ALL CLUSTER SETTINGS`), we default to a very pithy,
unwrapped description.

Release note: None
  • Loading branch information
irfansharif committed Jan 18, 2021
1 parent 5f97570 commit 17ea476
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 28 deletions.
3 changes: 1 addition & 2 deletions pkg/sql/distsql_running.go
Original file line number Diff line number Diff line change
Expand Up @@ -644,8 +644,7 @@ func (r *DistSQLReceiver) Push(
if len(meta.TraceData) > 0 {
span := tracing.SpanFromContext(r.ctx)
if span == nil {
r.resultWriter.SetError(
errors.New("trying to ingest remote spans but there is no recording span set up"))
// Nothing to do.
} else if err := span.ImportRemoteSpans(meta.TraceData); err != nil {
r.resultWriter.SetError(errors.Errorf("error ingesting remote spans: %s", err))
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/execinfra/processorsbase.go
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ const (
// returned from now on. In this state, the processor is expected to drain its
// inputs (commonly by using DrainHelper()).
//
// If the processor has no input (ProcStateOpts.intputToDrain was not specified
// If the processor has no input (ProcStateOpts.inputToDrain was not specified
// at init() time), then we move straight to the StateTrailingMeta.
//
// An error can be optionally passed. It will be the first piece of metadata
Expand Down
15 changes: 15 additions & 0 deletions pkg/sql/flowinfra/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ func TestClusterFlow(t *testing.T) {
metas = ignoreMisplannedRanges(metas)
metas = ignoreLeafTxnState(metas)
metas = ignoreMetricsMeta(metas)
metas = ignoreTraceData(metas)
if len(metas) != 0 {
t.Fatalf("unexpected metadata (%d): %+v", len(metas), metas)
}
Expand Down Expand Up @@ -327,6 +328,18 @@ func ignoreMetricsMeta(metas []execinfrapb.ProducerMetadata) []execinfrapb.Produ
return res
}

// ignoreTraceData takes a slice of metadata and returns the entries
// excluding the ones with trace data.
func ignoreTraceData(metas []execinfrapb.ProducerMetadata) []execinfrapb.ProducerMetadata {
res := make([]execinfrapb.ProducerMetadata, 0)
for _, m := range metas {
if m.TraceData == nil {
res = append(res, m)
}
}
return res
}

// TestLimitedBufferingDeadlock sets up a scenario which leads to deadlock if
// a single consumer can block the entire router (#17097).
func TestLimitedBufferingDeadlock(t *testing.T) {
Expand Down Expand Up @@ -545,6 +558,7 @@ func TestLimitedBufferingDeadlock(t *testing.T) {
metas = ignoreMisplannedRanges(metas)
metas = ignoreLeafTxnState(metas)
metas = ignoreMetricsMeta(metas)
metas = ignoreTraceData(metas)
if len(metas) != 0 {
t.Errorf("unexpected metadata (%d): %+v", len(metas), metas)
}
Expand Down Expand Up @@ -852,6 +866,7 @@ func BenchmarkInfrastructure(b *testing.B) {
metas = ignoreMisplannedRanges(metas)
metas = ignoreLeafTxnState(metas)
metas = ignoreMetricsMeta(metas)
metas = ignoreTraceData(metas)
if len(metas) != 0 {
b.Fatalf("unexpected metadata (%d): %+v", len(metas), metas)
}
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/flowinfra/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ func TestServer(t *testing.T) {
}
metas = ignoreLeafTxnState(metas)
metas = ignoreMetricsMeta(metas)
metas = ignoreTraceData(metas)
if len(metas) != 0 {
t.Errorf("unexpected metadata: %v", metas)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/physicalplan/aggregator_funcs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func runTestFlow(
for {
row, meta := rowBuf.Next()
if meta != nil {
if meta.LeafTxnFinalState != nil || meta.Metrics != nil {
if meta.LeafTxnFinalState != nil || meta.Metrics != nil || meta.TraceData != nil {
continue
}
t.Fatalf("unexpected metadata: %v", meta)
Expand Down
46 changes: 22 additions & 24 deletions pkg/util/tracing/tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,32 +62,30 @@ const (
modeBackground
)

// tracingMode informs the creation of noop spans
// and the default recording mode of created spans.
// tracingMode informs the creation of noop spans and the default recording mode
// of created spans.
//
// If set to 'background', trace spans will be created for all operations, but
// these will record sparse structured information, unless an operation
// explicitly requests the verbose from. It's optimized for low overhead, and
// powers fine-grained statistics and alerts.
//
// If set to 'legacy', trace spans will not be created by default. This is
// unless an internal code path explicitly requests for it, or if an auxiliary
// tracer (such as lightstep or zipkin) is configured. This tracing mode always
// records in the verbose form. Using this mode has two effects: the
// observability of the cluster may be degraded (as most trace spans are elided)
// and where trace spans are created, they may consume large amounts of memory.
//
// Note that regardless of this setting, configuring an auxiliary trace sink
// will cause verbose traces to be created for all operations, which may lead to
// high memory consumption. It is not currently possible to send non-verbose
// traces to auxiliary sinks.
var tracingMode = settings.RegisterEnumSetting(
"trace.mode",
`configures the CockroachDB-internal tracing subsystem.
If set to 'background', trace spans will be created for all operations, but
these trace spans will only be recording sparse structured information,
unless an operation explicitly requests verbose recording. This is
optimized for low overhead, and powers fine-grained statistics and alerts.
If set to 'legacy', trace spans will not be created (unless an
auxiliary tracer such as Lightstep or Zipkin, is configured, or an
internal code path explicitly requests a trace to be created) but
when they are, they record verbose information. This has two effects:
the observability of the cluster may be degraded (as some trace spans
are elided) and where trace spans are created, they may consume large
amounts of memory. This mode should not be used with auxiliary tracing
sinks as that leads to expensive trace spans being created throughout.
Note that regardless of this setting, configuring an auxiliary
trace sink will cause verbose traces to be created for all
operations, which may lead to high memory consumption. It is not
currently possible to send non-verbose traces to auxiliary sinks.
`,
"legacy",
"if set to 'background', traces will be created for all operations (in"+
"'legacy' mode it's created when explicitly requested or when auxiliary tracers are configured)",
"background",
map[int64]string{
int64(modeLegacy): "legacy",
int64(modeBackground): "background",
Expand Down

0 comments on commit 17ea476

Please sign in to comment.