From 9a5c7ca33b0d8f1b2fd8dacb61fad05875c478cd Mon Sep 17 00:00:00 2001
From: Piotr WOLSKI <piotr.wolski42@gmail.com>
Date: Thu, 22 Jul 2021 17:33:09 -0400
Subject: [PATCH] ddtrace/tracer: Don't drop trace if some spans must be kept
 (#963)

The current behavior is to drop the trace except if all spans should be
kept.

This doesn't work if:
* a downstream span has no error, is closed (so should be dropped) but then
  the rest of the trace ends up having an error so should be kept.
* a span should be kept for app analytics (event sampling), but the trace
  is a p0 so the whole trace is dropped.

The commit inverses the behavior: Keep the trace except if all spans should
be dropped.
---
 ddtrace/tracer/span.go             | 29 +++++-------
 ddtrace/tracer/span_test.go        | 18 ++++----
 ddtrace/tracer/spancontext.go      | 71 +++++++++++++++++++++++-------
 ddtrace/tracer/spancontext_test.go | 15 ++++---
 ddtrace/tracer/tracer.go           |  2 +-
 ddtrace/tracer/tracer_test.go      | 60 +++++++++++++++++++++++++
 6 files changed, 146 insertions(+), 49 deletions(-)

diff --git a/ddtrace/tracer/span.go b/ddtrace/tracer/span.go
index f349538320..db37c2e04f 100644
--- a/ddtrace/tracer/span.go
+++ b/ddtrace/tracer/span.go
@@ -344,6 +344,7 @@ func (s *span) finish(finishTime int64) {
 	}
 	s.finished = true
 
+	keep := true
 	if t, ok := internal.GetGlobalTracer().(*tracer); ok {
 		// we have an active tracer
 		feats := t.features.Load()
@@ -358,19 +359,12 @@ func (s *span) finish(finishTime int64) {
 		}
 		if feats.DropP0s {
 			// the agent supports dropping p0's in the client
-			if shouldDrop(s) {
-				// ...and this span can be dropped
-				atomic.AddUint64(&t.droppedP0Spans, 1)
-				if s == s.context.trace.root {
-					atomic.AddUint64(&t.droppedP0Traces, 1)
-				}
-				return
-			}
+			keep = shouldKeep(s)
 		}
 	}
-	if s.context.drop {
-		// not sampled by local sampler
-		return
+	if keep {
+		// a single kept span keeps the whole trace.
+		s.context.trace.keep()
 	}
 	s.context.finish()
 }
@@ -401,20 +395,21 @@ func newAggregableSpan(s *span, cfg *config) *aggregableSpan {
 	}
 }
 
-// shouldDrop reports whether it's fine to drop the span s.
-func shouldDrop(s *span) bool {
+// shouldKeep reports whether the trace should be kept.
+// a single span being kept implies the whole trace being kept.
+func shouldKeep(s *span) bool {
 	if p, ok := s.context.samplingPriority(); ok && p > 0 {
 		// positive sampling priorities stay
-		return false
+		return true
 	}
 	if atomic.LoadInt64(&s.context.errors) > 0 {
 		// traces with any span containing an error get kept
-		return false
+		return true
 	}
 	if v, ok := s.Metrics[ext.EventSampleRate]; ok {
-		return !sampledByRate(s.TraceID, v)
+		return sampledByRate(s.TraceID, v)
 	}
-	return true
+	return false
 }
 
 // shouldComputeStats mentions whether this span needs to have stats computed for.
diff --git a/ddtrace/tracer/span_test.go b/ddtrace/tracer/span_test.go
index b9943774d9..f8d509f21a 100644
--- a/ddtrace/tracer/span_test.go
+++ b/ddtrace/tracer/span_test.go
@@ -107,26 +107,26 @@ func TestShouldDrop(t *testing.T) {
 		rate   float64
 		want   bool
 	}{
-		{1, 0, 0, false},
-		{2, 1, 0, false},
-		{0, 1, 0, false},
-		{0, 0, 1, false},
-		{0, 0, 0.5, false},
-		{0, 0, 0.00001, true},
-		{0, 0, 0, true},
+		{1, 0, 0, true},
+		{2, 1, 0, true},
+		{0, 1, 0, true},
+		{0, 0, 1, true},
+		{0, 0, 0.5, true},
+		{0, 0, 0.00001, false},
+		{0, 0, 0, false},
 	} {
 		t.Run("", func(t *testing.T) {
 			s := newSpan("", "", "", 1, 1, 0)
 			s.SetTag(ext.SamplingPriority, tt.prio)
 			s.SetTag(ext.EventSampleRate, tt.rate)
 			atomic.StoreInt64(&s.context.errors, tt.errors)
-			assert.Equal(t, shouldDrop(s), tt.want)
+			assert.Equal(t, shouldKeep(s), tt.want)
 		})
 	}
 
 	t.Run("none", func(t *testing.T) {
 		s := newSpan("", "", "", 1, 1, 0)
-		assert.Equal(t, shouldDrop(s), true)
+		assert.Equal(t, shouldKeep(s), false)
 	})
 }
 
diff --git a/ddtrace/tracer/spancontext.go b/ddtrace/tracer/spancontext.go
index 3d1cb4ddfa..d88fbd4ba0 100644
--- a/ddtrace/tracer/spancontext.go
+++ b/ddtrace/tracer/spancontext.go
@@ -10,6 +10,7 @@ import (
 	"sync/atomic"
 
 	"gopkg.in/DataDog/dd-trace-go.v1/ddtrace"
+	"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/ext"
 	"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/internal"
 	"gopkg.in/DataDog/dd-trace-go.v1/internal/log"
 )
@@ -25,7 +26,6 @@ type spanContext struct {
 
 	trace  *trace // reference to the trace that this span belongs too
 	span   *span  // reference to the span that hosts this context
-	drop   bool   // when true, the span will not be sent to the agent
 	errors int64  // number of spans with errors in this trace
 
 	// the below group should propagate cross-process
@@ -52,7 +52,6 @@ func newSpanContext(span *span, parent *spanContext) *spanContext {
 	}
 	if parent != nil {
 		context.trace = parent.trace
-		context.drop = parent.drop
 		context.origin = parent.origin
 		context.errors = parent.errors
 		parent.ForeachBaggageItem(func(k, v string) bool {
@@ -128,16 +127,30 @@ func (c *spanContext) baggageItem(key string) string {
 // finish marks this span as finished in the trace.
 func (c *spanContext) finish() { c.trace.finishedOne(c.span) }
 
+// samplingDecision is the decision to send a trace to the agent or not.
+type samplingDecision int64
+
+const (
+	// decisionNone is the default state of a trace.
+	// If no decision is made about the trace, the trace won't be sent to the agent.
+	decisionNone samplingDecision = iota
+	// decisionDrop prevents the trace from being sent to the agent.
+	decisionDrop
+	// decisionKeep ensures the trace will be sent to the agent.
+	decisionKeep
+)
+
 // trace contains shared context information about a trace, such as sampling
 // priority, the root reference and a buffer of the spans which are part of the
 // trace, if these exist.
 type trace struct {
-	mu       sync.RWMutex // guards below fields
-	spans    []*span      // all the spans that are part of this trace
-	finished int          // the number of finished spans
-	full     bool         // signifies that the span buffer is full
-	priority *float64     // sampling priority
-	locked   bool         // specifies if the sampling priority can be altered
+	mu               sync.RWMutex     // guards below fields
+	spans            []*span          // all the spans that are part of this trace
+	finished         int              // the number of finished spans
+	full             bool             // signifies that the span buffer is full
+	priority         *float64         // sampling priority
+	locked           bool             // specifies if the sampling priority can be altered
+	samplingDecision samplingDecision // samplingDecision indicates whether to send the trace to the agent.
 
 	// root specifies the root of the trace, if known; it is nil when a span
 	// context is extracted from a carrier, at which point there are no spans in
@@ -164,21 +177,33 @@ func newTrace() *trace {
 	return &trace{spans: make([]*span, 0, traceStartSize)}
 }
 
-func (t *trace) samplingPriority() (p int, ok bool) {
-	t.mu.RLock()
-	defer t.mu.RUnlock()
+func (t *trace) samplingPriorityLocked() (p int, ok bool) {
 	if t.priority == nil {
 		return 0, false
 	}
 	return int(*t.priority), true
 }
 
+func (t *trace) samplingPriority() (p int, ok bool) {
+	t.mu.RLock()
+	defer t.mu.RUnlock()
+	return t.samplingPriorityLocked()
+}
+
 func (t *trace) setSamplingPriority(p float64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	t.setSamplingPriorityLocked(p)
 }
 
+func (t *trace) keep() {
+	atomic.CompareAndSwapInt64((*int64)(&t.samplingDecision), int64(decisionNone), int64(decisionKeep))
+}
+
+func (t *trace) drop() {
+	atomic.CompareAndSwapInt64((*int64)(&t.samplingDecision), int64(decisionNone), int64(decisionDrop))
+}
+
 func (t *trace) setSamplingPriorityLocked(p float64) {
 	if t.locked {
 		return
@@ -246,11 +271,23 @@ func (t *trace) finishedOne(s *span) {
 	if len(t.spans) != t.finished {
 		return
 	}
-	if tr, ok := internal.GetGlobalTracer().(*tracer); ok {
-		// we have a tracer that can receive completed traces.
-		tr.pushTrace(t.spans)
-		atomic.AddInt64(&tr.spansFinished, int64(len(t.spans)))
+	defer func() {
+		t.spans = nil
+		t.finished = 0 // important, because a buffer can be used for several flushes
+	}()
+	tr, ok := internal.GetGlobalTracer().(*tracer)
+	if !ok {
+		return
+	}
+	// we have a tracer that can receive completed traces.
+	atomic.AddInt64(&tr.spansFinished, int64(len(t.spans)))
+	sd := samplingDecision(atomic.LoadInt64((*int64)(&t.samplingDecision)))
+	if sd != decisionKeep {
+		if p, ok := t.samplingPriorityLocked(); ok && p == ext.PriorityAutoReject {
+			atomic.AddUint64(&tr.droppedP0Spans, uint64(len(t.spans)))
+			atomic.AddUint64(&tr.droppedP0Traces, 1)
+		}
+		return
 	}
-	t.spans = nil
-	t.finished = 0 // important, because a buffer can be used for several flushes
+	tr.pushTrace(t.spans)
 }
diff --git a/ddtrace/tracer/spancontext_test.go b/ddtrace/tracer/spancontext_test.go
index 3d5ed72193..da0c4ae3c8 100644
--- a/ddtrace/tracer/spancontext_test.go
+++ b/ddtrace/tracer/spancontext_test.go
@@ -304,11 +304,8 @@ func TestSpanContextParent(t *testing.T) {
 			baggage:    map[string]string{"A": "A", "B": "B"},
 			hasBaggage: 1,
 			trace:      newTrace(),
-			drop:       true,
-		},
-		"nil-trace": &spanContext{
-			drop: true,
 		},
+		"nil-trace": &spanContext{},
 		"priority": &spanContext{
 			baggage:    map[string]string{"A": "A", "B": "B"},
 			hasBaggage: 1,
@@ -317,6 +314,14 @@ func TestSpanContextParent(t *testing.T) {
 				priority: func() *float64 { v := new(float64); *v = 2; return v }(),
 			},
 		},
+		"sampling_decision": &spanContext{
+			baggage:    map[string]string{"A": "A", "B": "B"},
+			hasBaggage: 1,
+			trace: &trace{
+				spans:            []*span{newBasicSpan("abc")},
+				samplingDecision: decisionKeep,
+			},
+		},
 		"origin": &spanContext{
 			trace:  &trace{spans: []*span{newBasicSpan("abc")}},
 			origin: "synthetics",
@@ -334,8 +339,8 @@ func TestSpanContextParent(t *testing.T) {
 			assert.Contains(ctx.trace.spans, s)
 			if parentCtx.trace != nil {
 				assert.Equal(ctx.trace.priority, parentCtx.trace.priority)
+				assert.Equal(ctx.trace.samplingDecision, parentCtx.trace.samplingDecision)
 			}
-			assert.Equal(parentCtx.drop, ctx.drop)
 			assert.Equal(parentCtx.baggage, ctx.baggage)
 			assert.Equal(parentCtx.origin, ctx.origin)
 		})
diff --git a/ddtrace/tracer/tracer.go b/ddtrace/tracer/tracer.go
index ba2c8576aa..06cc12f731 100644
--- a/ddtrace/tracer/tracer.go
+++ b/ddtrace/tracer/tracer.go
@@ -487,7 +487,7 @@ func (t *tracer) sample(span *span) {
 	}
 	sampler := t.config.sampler
 	if !sampler.Sample(span) {
-		span.context.drop = true
+		span.context.trace.drop()
 		return
 	}
 	if rs, ok := sampler.(RateSampler); ok && rs.Rate() < 1 {
diff --git a/ddtrace/tracer/tracer_test.go b/ddtrace/tracer/tracer_test.go
index 6be3ab10df..9f2b40cc07 100644
--- a/ddtrace/tracer/tracer_test.go
+++ b/ddtrace/tracer/tracer_test.go
@@ -238,6 +238,66 @@ func TestTracerStartSpan(t *testing.T) {
 	})
 }
 
+func TestSamplingDecision(t *testing.T) {
+	t.Run("sampled", func(t *testing.T) {
+		tracer, _, _, stop := startTestTracer(t)
+		defer stop()
+		tracer.prioritySampling.defaultRate = 0
+		tracer.config.serviceName = "test_service"
+		span := tracer.StartSpan("name_1").(*span)
+		child := tracer.StartSpan("name_2", ChildOf(span.context))
+		child.Finish()
+		span.Finish()
+		assert.Equal(t, float64(ext.PriorityAutoReject), span.Metrics[keySamplingPriority])
+		assert.Equal(t, decisionKeep, span.context.trace.samplingDecision)
+	})
+
+	t.Run("dropped", func(t *testing.T) {
+		tracer, _, _, stop := startTestTracer(t)
+		defer stop()
+		tracer.features.DropP0s = true
+		tracer.prioritySampling.defaultRate = 0
+		tracer.config.serviceName = "test_service"
+		span := tracer.StartSpan("name_1").(*span)
+		child := tracer.StartSpan("name_2", ChildOf(span.context))
+		child.Finish()
+		span.Finish()
+		assert.Equal(t, float64(ext.PriorityAutoReject), span.Metrics[keySamplingPriority])
+		assert.Equal(t, decisionNone, span.context.trace.samplingDecision)
+	})
+
+	t.Run("events_sampled", func(t *testing.T) {
+		tracer, _, _, stop := startTestTracer(t)
+		defer stop()
+		tracer.features.DropP0s = true
+		tracer.prioritySampling.defaultRate = 0
+		tracer.config.serviceName = "test_service"
+		span := tracer.StartSpan("name_1").(*span)
+		child := tracer.StartSpan("name_2", ChildOf(span.context))
+		child.SetTag(ext.EventSampleRate, 1)
+		child.Finish()
+		span.Finish()
+		assert.Equal(t, float64(ext.PriorityAutoReject), span.Metrics[keySamplingPriority])
+		assert.Equal(t, decisionKeep, span.context.trace.samplingDecision)
+	})
+
+	t.Run("client_dropped", func(t *testing.T) {
+		tracer, _, _, stop := startTestTracer(t)
+		defer stop()
+		tracer.features.DropP0s = true
+		tracer.config.sampler = NewRateSampler(0)
+		tracer.prioritySampling.defaultRate = 0
+		tracer.config.serviceName = "test_service"
+		span := tracer.StartSpan("name_1").(*span)
+		child := tracer.StartSpan("name_2", ChildOf(span.context))
+		child.SetTag(ext.EventSampleRate, 1)
+		child.Finish()
+		span.Finish()
+		assert.Equal(t, float64(ext.PriorityAutoReject), span.Metrics[keySamplingPriority])
+		assert.Equal(t, decisionDrop, span.context.trace.samplingDecision)
+	})
+}
+
 func TestTracerRuntimeMetrics(t *testing.T) {
 	t.Run("on", func(t *testing.T) {
 		tp := new(testLogger)