From 822d81bbc2555e86507db164dc079b2ed83248d9 Mon Sep 17 00:00:00 2001 From: Mikayla Toffler <46911781+mtoffl01@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:16:02 -0500 Subject: [PATCH] ddtrace/tracer: report datadog.tracer.api.errors health metric (#3024) --- ddtrace/tracer/transport.go | 30 ++++++++--- ddtrace/tracer/transport_test.go | 85 +++++++++++++++++++++++++++++++ internal/statsdtest/statsdtest.go | 14 +++++ 3 files changed, 123 insertions(+), 6 deletions(-) diff --git a/ddtrace/tracer/transport.go b/ddtrace/tracer/transport.go index a8ad9cfe8e..222f6c8b70 100644 --- a/ddtrace/tracer/transport.go +++ b/ddtrace/tracer/transport.go @@ -151,16 +151,18 @@ func (t *httpTransport) send(p *payload) (body io.ReadCloser, err error) { } req.Header.Set(traceCountHeader, strconv.Itoa(p.itemCount())) req.Header.Set(headerComputedTopLevel, "yes") - if t, ok := traceinternal.GetGlobalTracer().(*tracer); ok { - if t.config.tracingAsTransport || t.config.canComputeStats() { + var tr *tracer + var haveTracer bool + if tr, haveTracer = traceinternal.GetGlobalTracer().(*tracer); haveTracer { + if tr.config.tracingAsTransport || tr.config.canComputeStats() { // tracingAsTransport uses this header to disable the trace agent's stats computation // while making canComputeStats() always false to also disable client stats computation. req.Header.Set("Datadog-Client-Computed-Stats", "yes") } - droppedTraces := int(atomic.SwapUint32(&t.droppedP0Traces, 0)) - partialTraces := int(atomic.SwapUint32(&t.partialTraces, 0)) - droppedSpans := int(atomic.SwapUint32(&t.droppedP0Spans, 0)) - if stats := t.statsd; stats != nil { + droppedTraces := int(atomic.SwapUint32(&tr.droppedP0Traces, 0)) + partialTraces := int(atomic.SwapUint32(&tr.partialTraces, 0)) + droppedSpans := int(atomic.SwapUint32(&tr.droppedP0Spans, 0)) + if stats := tr.statsd; stats != nil { stats.Count("datadog.tracer.dropped_p0_traces", int64(droppedTraces), []string{fmt.Sprintf("partial:%s", strconv.FormatBool(partialTraces > 0))}, 1) stats.Count("datadog.tracer.dropped_p0_spans", int64(droppedSpans), nil, 1) @@ -170,9 +172,11 @@ func (t *httpTransport) send(p *payload) (body io.ReadCloser, err error) { } response, err := t.client.Do(req) if err != nil { + reportAPIErrorsMetric(haveTracer, response, err, tr) return nil, err } if code := response.StatusCode; code >= 400 { + reportAPIErrorsMetric(haveTracer, response, err, tr) // error, check the body for context information and // return a nice error. msg := make([]byte, 1000) @@ -187,6 +191,20 @@ func (t *httpTransport) send(p *payload) (body io.ReadCloser, err error) { return response.Body, nil } +func reportAPIErrorsMetric(haveTracer bool, response *http.Response, err error, t *tracer) { + if !haveTracer { + return + } + var reason string + if err != nil { + reason = "network_failure" + } + if response != nil { + reason = fmt.Sprintf("server_response_%d", response.StatusCode) + } + t.statsd.Incr("datadog.tracer.api.errors", []string{"reason:" + reason}, 1) +} + func (t *httpTransport) endpoint() string { return t.traceURL } diff --git a/ddtrace/tracer/transport_test.go b/ddtrace/tracer/transport_test.go index 8c1148277c..b8967713cf 100644 --- a/ddtrace/tracer/transport_test.go +++ b/ddtrace/tracer/transport_test.go @@ -20,7 +20,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + traceinternal "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/internal" "gopkg.in/DataDog/dd-trace-go.v1/internal" + "gopkg.in/DataDog/dd-trace-go.v1/internal/statsdtest" ) // getTestSpan returns a Span with different fields set @@ -241,6 +243,89 @@ func TestCustomTransport(t *testing.T) { assert.Equal(hits, 1) } +type ErrTransport struct{} + +func (t *ErrTransport) RoundTrip(req *http.Request) (*http.Response, error) { + return nil, fmt.Errorf("error in RoundTripper") +} + +type ErrResponseTransport struct{} + +func (t *ErrResponseTransport) RoundTrip(req *http.Request) (*http.Response, error) { + return &http.Response{StatusCode: 400}, nil +} + +type OkTransport struct{} + +func (t *OkTransport) RoundTrip(req *http.Request) (*http.Response, error) { + return &http.Response{StatusCode: 200}, nil +} + +func TestApiErrorsMetric(t *testing.T) { + t.Run("error", func(t *testing.T) { + assert := assert.New(t) + c := &http.Client{ + Transport: &ErrTransport{}, + } + var tg statsdtest.TestStatsdClient + trc := newTracer(WithHTTPClient(c), withStatsdClient(&tg)) + traceinternal.SetGlobalTracer(trc) + defer trc.Stop() + + p, err := encode(getTestTrace(1, 1)) + assert.NoError(err) + + // We're expecting an error + _, err = trc.config.transport.send(p) + assert.Error(err) + calls := statsdtest.FilterCallsByName(tg.IncrCalls(), "datadog.tracer.api.errors") + assert.Len(calls, 1) + call := calls[0] + assert.Equal([]string{"reason:network_failure"}, call.Tags()) + + }) + t.Run("response with err code", func(t *testing.T) { + assert := assert.New(t) + c := &http.Client{ + Transport: &ErrResponseTransport{}, + } + var tg statsdtest.TestStatsdClient + trc := newTracer(WithHTTPClient(c), withStatsdClient(&tg)) + traceinternal.SetGlobalTracer(trc) + defer trc.Stop() + + p, err := encode(getTestTrace(1, 1)) + assert.NoError(err) + + _, err = trc.config.transport.send(p) + assert.Error(err) + + calls := statsdtest.FilterCallsByName(tg.IncrCalls(), "datadog.tracer.api.errors") + assert.Len(calls, 1) + call := calls[0] + assert.Equal([]string{"reason:server_response_400"}, call.Tags()) + }) + t.Run("successful send - no metric", func(t *testing.T) { + assert := assert.New(t) + var tg statsdtest.TestStatsdClient + c := &http.Client{ + Transport: &OkTransport{}, + } + trc := newTracer(WithHTTPClient(c), withStatsdClient(&tg)) + traceinternal.SetGlobalTracer(trc) + defer trc.Stop() + + p, err := encode(getTestTrace(1, 1)) + assert.NoError(err) + + _, err = trc.config.transport.send(p) + assert.NoError(err) + + calls := statsdtest.FilterCallsByName(tg.IncrCalls(), "datadog.tracer.api.errors") + assert.Len(calls, 0) + }) +} + func TestWithHTTPClient(t *testing.T) { // disable instrumentation telemetry to prevent flaky number of requests t.Setenv("DD_INSTRUMENTATION_TELEMETRY_ENABLED", "false") diff --git a/internal/statsdtest/statsdtest.go b/internal/statsdtest/statsdtest.go index ffcd700cd1..bed3646fbc 100644 --- a/internal/statsdtest/statsdtest.go +++ b/internal/statsdtest/statsdtest.go @@ -49,6 +49,10 @@ type TestStatsdCall struct { rate float64 } +func (c *TestStatsdCall) Tags() []string { + return c.tags +} + func (tg *TestStatsdClient) addCount(name string, value int64) { tg.mu.Lock() defer tg.mu.Unlock() @@ -221,6 +225,16 @@ func (tg *TestStatsdClient) CallsByName() map[string]int { return counts } +func FilterCallsByName(calls []TestStatsdCall, name string) []TestStatsdCall { + var matches []TestStatsdCall + for _, c := range calls { + if c.name == name { + matches = append(matches, c) + } + } + return matches +} + func (tg *TestStatsdClient) Counts() map[string]int64 { tg.mu.RLock() defer tg.mu.RUnlock()