From beb5a530e37825f2933d6e31120a0795b4657972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Tue, 14 Dec 2021 14:28:14 +0100 Subject: [PATCH] profiler: Code Hotspots and Endpoint Filtering (#966) tracer: Profiler Endpoints and Code Hotspots Implement profiler endpoints and code hotspots feature. When enabled, the tracer automatically applies "span id", "local root span id" and "trace endpoint" pprof labels to goroutines as spans are started and finished. These labels end up in the the CPU profiles uploaded by the profilers and are leveraged by our backend to connect APM Spans to profiling information and vice versa. The feature is disabled by default for now, but this might change after some further testing. New API: - tracer.WithProfilerCodeHotspots() option - tracer.WithProfilerEndpoints() option - StartSpanFromContext() added to tracer.Tracer interface New env vars: - DD_PROFILING_ENDPOINT_COLLECTION_ENABLED - DD_PROFILING_CODE_HOTSPOTS_COLLECTION_ENABLED --- checkcopyright.go | 5 +- ddtrace/ddtrace.go | 11 +- ddtrace/internal/globaltracer.go | 6 + ddtrace/mocktracer/mocktracer.go | 14 +- ddtrace/tracer/context.go | 10 +- ddtrace/tracer/option.go | 34 +++ ddtrace/tracer/option_test.go | 29 ++ ddtrace/tracer/span.go | 14 +- ddtrace/tracer/tracer.go | 69 ++++- internal/traceprof/testapp/test_app.go | 8 + internal/traceprof/testapp/test_app.pb.go | 242 +++++++++++++++ internal/traceprof/testapp/test_app.proto | 24 ++ .../traceprof/testapp/test_app_grpc.pb.go | 102 +++++++ internal/traceprof/traceprof.go | 20 ++ internal/traceprof/traceproftest/app.go | 285 ++++++++++++++++++ internal/traceprof/traceproftest/baseline.txt | 51 ++++ internal/traceprof/traceproftest/bench.bash | 18 ++ .../traceprof/traceproftest/cpu_profiler.go | 135 +++++++++ .../traceproftest/endpoints-and-hotspots.txt | 51 ++++ internal/traceprof/traceproftest/overhead.txt | 43 +++ .../traceprof/traceproftest/traceprof_test.go | 228 ++++++++++++++ .../traceprof/traceproftest/traceproftest.go | 9 + internal/traceprof/traceproftest/util.go | 37 +++ profiler/profile_test.go | 5 +- 24 files changed, 1432 insertions(+), 18 deletions(-) create mode 100644 internal/traceprof/testapp/test_app.go create mode 100644 internal/traceprof/testapp/test_app.pb.go create mode 100644 internal/traceprof/testapp/test_app.proto create mode 100644 internal/traceprof/testapp/test_app_grpc.pb.go create mode 100644 internal/traceprof/traceprof.go create mode 100644 internal/traceprof/traceproftest/app.go create mode 100644 internal/traceprof/traceproftest/baseline.txt create mode 100755 internal/traceprof/traceproftest/bench.bash create mode 100644 internal/traceprof/traceproftest/cpu_profiler.go create mode 100644 internal/traceprof/traceproftest/endpoints-and-hotspots.txt create mode 100644 internal/traceprof/traceproftest/overhead.txt create mode 100644 internal/traceprof/traceproftest/traceprof_test.go create mode 100644 internal/traceprof/traceproftest/traceproftest.go create mode 100644 internal/traceprof/traceproftest/util.go diff --git a/checkcopyright.go b/checkcopyright.go index 678b5d246b..f0f9849eb0 100644 --- a/checkcopyright.go +++ b/checkcopyright.go @@ -3,6 +3,7 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2016 Datadog, Inc. +//go:build ignore // +build ignore // This tool validates that all *.go files in the repository have the copyright text attached. @@ -22,6 +23,8 @@ func main() { // copyrightRegexp matches years or year ranges like "2016", "2016-2019", // "2016,2018-2020" in the copyright header. copyrightRegexp := regexp.MustCompile(`// Copyright 20[0-9]{2}[0-9,\-]* Datadog, Inc.`) + generatedRegexp := regexp.MustCompile(`Code generated by.+DO NOT EDIT`) + if err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error { if err != nil { return err @@ -40,7 +43,7 @@ func main() { if err != nil && err != io.EOF { return err } - if !copyrightRegexp.Match(snip) { + if !copyrightRegexp.Match(snip) && !generatedRegexp.Match(snip) { // report missing header missing = true log.Printf("Copyright header missing in %q.\n", path) diff --git a/ddtrace/ddtrace.go b/ddtrace/ddtrace.go index 9fe41b1eb1..9aeafebe8d 100644 --- a/ddtrace/ddtrace.go +++ b/ddtrace/ddtrace.go @@ -13,7 +13,10 @@ // with by accessing the subdirectories of this package: https://godoc.org/gopkg.in/DataDog/dd-trace-go.v1/ddtrace#pkg-subdirectories. package ddtrace // import "gopkg.in/DataDog/dd-trace-go.v1/ddtrace" -import "time" +import ( + "context" + "time" +) // Tracer specifies an implementation of the Datadog tracer which allows starting // and propagating spans. The official implementation if exposed as functions @@ -22,6 +25,12 @@ type Tracer interface { // StartSpan starts a span with the given operation name and options. StartSpan(operationName string, opts ...StartSpanOption) Span + // StartSpanFromContext starts a span with the given operation name and + // options. If a span is found in the context, it will be used as the parent + // of the resulting span. If the ChildOf option is passed, the span from + // context will take precedence over it as the parent span. + StartSpanFromContext(ctx context.Context, operationName string, opts ...StartSpanOption) (Span, context.Context) + // Extract extracts a span context from a given carrier. Note that baggage item // keys will always be lower-cased to maintain consistency. It is impossible to // maintain the original casing due to MIME header canonicalization standards. diff --git a/ddtrace/internal/globaltracer.go b/ddtrace/internal/globaltracer.go index 239e976267..c086d0ee01 100644 --- a/ddtrace/internal/globaltracer.go +++ b/ddtrace/internal/globaltracer.go @@ -6,6 +6,7 @@ package internal // import "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/internal" import ( + "context" "sync" "gopkg.in/DataDog/dd-trace-go.v1/ddtrace" @@ -51,6 +52,11 @@ func (NoopTracer) StartSpan(operationName string, opts ...ddtrace.StartSpanOptio return NoopSpan{} } +// StartSpanFromContext implements ddtrace.Tracer. +func (NoopTracer) StartSpanFromContext(ctx context.Context, operationName string, options ...ddtrace.StartSpanOption) (ddtrace.Span, context.Context) { + return NoopSpan{}, context.Background() +} + // SetServiceInfo implements ddtrace.Tracer. func (NoopTracer) SetServiceInfo(name, app, appType string) {} diff --git a/ddtrace/mocktracer/mocktracer.go b/ddtrace/mocktracer/mocktracer.go index bcdb6953e9..8edda50181 100644 --- a/ddtrace/mocktracer/mocktracer.go +++ b/ddtrace/mocktracer/mocktracer.go @@ -13,6 +13,7 @@ package mocktracer import ( + "context" "strconv" "strings" "sync" @@ -73,17 +74,28 @@ func (*mocktracer) Stop() { } func (t *mocktracer) StartSpan(operationName string, opts ...ddtrace.StartSpanOption) ddtrace.Span { + span, _ := t.StartSpanFromContext(context.Background(), operationName, opts...) + return span +} + +func (t *mocktracer) StartSpanFromContext(ctx context.Context, operationName string, opts ...ddtrace.StartSpanOption) (ddtrace.Span, context.Context) { var cfg ddtrace.StartSpanConfig for _, fn := range opts { fn(&cfg) } + if ctx == nil { + ctx = context.Background() + } else if s, ok := tracer.SpanFromContext(ctx); ok { + // span in ctx overwrite ChildOf() parent if any + cfg.Parent = s.Context() + } span := newSpan(t, operationName, &cfg) t.Lock() t.openSpans[span.SpanID()] = span t.Unlock() - return span + return span, tracer.ContextWithSpan(ctx, span) } func (t *mocktracer) OpenSpans() []Span { diff --git a/ddtrace/tracer/context.go b/ddtrace/tracer/context.go index 0add06e99a..a8c0edfba5 100644 --- a/ddtrace/tracer/context.go +++ b/ddtrace/tracer/context.go @@ -39,13 +39,5 @@ func SpanFromContext(ctx context.Context) (Span, bool) { // is found in the context, it will be used as the parent of the resulting span. If the ChildOf // option is passed, the span from context will take precedence over it as the parent span. func StartSpanFromContext(ctx context.Context, operationName string, opts ...StartSpanOption) (Span, context.Context) { - if ctx == nil { - // default to context.Background() to avoid panics on Go >= 1.15 - ctx = context.Background() - } - if s, ok := SpanFromContext(ctx); ok { - opts = append(opts, ChildOf(s.Context())) - } - s := StartSpan(operationName, opts...) - return s, ContextWithSpan(ctx, s) + return internal.GetGlobalTracer().StartSpanFromContext(ctx, operationName, opts...) } diff --git a/ddtrace/tracer/option.go b/ddtrace/tracer/option.go index 93c490405b..f055679c9b 100644 --- a/ddtrace/tracer/option.go +++ b/ddtrace/tracer/option.go @@ -24,6 +24,7 @@ import ( "gopkg.in/DataDog/dd-trace-go.v1/internal" "gopkg.in/DataDog/dd-trace-go.v1/internal/globalconfig" "gopkg.in/DataDog/dd-trace-go.v1/internal/log" + "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof" "gopkg.in/DataDog/dd-trace-go.v1/internal/version" "github.com/DataDog/datadog-go/v5/statsd" @@ -122,6 +123,12 @@ type config struct { // errors will record a stack trace when this option is set. noDebugStack bool + // profilerHotspots specifies whether profiler Code Hotspots is enabled. + profilerHotspots bool + + // profilerEndpoints specifies whether profiler endpoint filtering is enabled. + profilerEndpoints bool + // enabled reports whether tracing is enabled. enabled bool } @@ -213,6 +220,9 @@ func newConfig(opts ...StartOption) *config { c.runtimeMetrics = internal.BoolEnv("DD_RUNTIME_METRICS_ENABLED", false) c.debug = internal.BoolEnv("DD_TRACE_DEBUG", false) c.enabled = internal.BoolEnv("DD_TRACE_ENABLED", true) + // TODO(fg): set these to true before going GA with this. + c.profilerEndpoints = internal.BoolEnv(traceprof.EndpointEnvVar, false) + c.profilerHotspots = internal.BoolEnv(traceprof.CodeHotspotsEnvVar, false) for _, fn := range opts { fn(c) @@ -665,6 +675,30 @@ func WithLogStartup(enabled bool) StartOption { } } +// WithProfilerCodeHotspots enables the code hotspots integration between the +// tracer and profiler. This is done by automatically attaching pprof labels +// called "span id" and "local root span id" when new spans are created. You +// should not use these label names in your own code when this is enabled. The +// enabled value defaults to the value of the +// DD_PROFILING_CODE_HOTSPOTS_COLLECTION_ENABLED env variable or false. +func WithProfilerCodeHotspots(enabled bool) StartOption { + return func(c *config) { + c.profilerHotspots = enabled + } +} + +// WithProfilerEndpoints enables the endpoints integration between the tracer +// and profiler. This is done by automatically attaching a pprof label called +// "trace endpoint" holding the resource name of the top-level service span if +// its type is http or rpc. You should not use this label name in your own code +// when this is enabled. The enabled value defaults to the value of the +// DD_PROFILING_ENDPOINT_COLLECTION_ENABLED env variable or false. +func WithProfilerEndpoints(enabled bool) StartOption { + return func(c *config) { + c.profilerEndpoints = enabled + } +} + // StartSpanOption is a configuration option for StartSpan. It is aliased in order // to help godoc group all the functions returning it together. It is considered // more correct to refer to it as the type as the origin, ddtrace.StartSpanOption. diff --git a/ddtrace/tracer/option_test.go b/ddtrace/tracer/option_test.go index 592b683559..410b4e43ee 100644 --- a/ddtrace/tracer/option_test.go +++ b/ddtrace/tracer/option_test.go @@ -20,6 +20,7 @@ import ( "time" "gopkg.in/DataDog/dd-trace-go.v1/internal/globalconfig" + "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -363,6 +364,34 @@ func TestTracerOptionsDefaults(t *testing.T) { assert.Equal(nil, dVal) }) + t.Run("profiler-endpoints", func(t *testing.T) { + t.Run("default", func(t *testing.T) { + c := newConfig() + assert.False(t, c.profilerEndpoints) + }) + + t.Run("override", func(t *testing.T) { + os.Setenv(traceprof.EndpointEnvVar, "true") + defer os.Unsetenv(traceprof.EndpointEnvVar) + c := newConfig() + assert.True(t, c.profilerEndpoints) + }) + }) + + t.Run("profiler-hotspots", func(t *testing.T) { + t.Run("default", func(t *testing.T) { + c := newConfig() + assert.False(t, c.profilerHotspots) + }) + + t.Run("override", func(t *testing.T) { + os.Setenv(traceprof.CodeHotspotsEnvVar, "true") + defer os.Unsetenv(traceprof.CodeHotspotsEnvVar) + c := newConfig() + assert.True(t, c.profilerHotspots) + }) + }) + t.Run("env-mapping", func(t *testing.T) { os.Setenv("DD_SERVICE_MAPPING", "tracer.test:test2, svc:Newsvc,http.router:myRouter, noval:") defer os.Unsetenv("DD_SERVICE_MAPPING") diff --git a/ddtrace/tracer/span.go b/ddtrace/tracer/span.go index c6d0162c53..630271989f 100644 --- a/ddtrace/tracer/span.go +++ b/ddtrace/tracer/span.go @@ -8,10 +8,12 @@ package tracer import ( + "context" "fmt" "os" "reflect" "runtime" + "runtime/pprof" "strconv" "strings" "sync" @@ -72,7 +74,11 @@ type span struct { noDebugStack bool `msg:"-"` // disables debug stack traces finished bool `msg:"-"` // true if the span has been submitted to a tracer. context *spanContext `msg:"-"` // span propagation context - taskEnd func() // ends execution tracer (runtime/trace) task, if started + + pprofCtxActive context.Context `msg:"-"` // contains pprof.WithLabel labels to tell the profiler more about this span + pprofCtxRestore context.Context `msg:"-"` // contains pprof.WithLabel labels of the parent span (if any) that need to be restored when this span finishes + + taskEnd func() // ends execution tracer (runtime/trace) task, if started } // Context yields the SpanContext for this Span. Note that the return @@ -320,6 +326,12 @@ func (s *span) Finish(opts ...ddtrace.FinishOption) { s.taskEnd() } s.finish(t) + + if s.pprofCtxRestore != nil { + // Restore the labels of the parent span so any CPU samples after this + // point are attributed correctly. + pprof.SetGoroutineLabels(s.pprofCtxRestore) + } } // SetOperationName sets or changes the operation name. diff --git a/ddtrace/tracer/tracer.go b/ddtrace/tracer/tracer.go index e64502d098..1f7e4cc737 100644 --- a/ddtrace/tracer/tracer.go +++ b/ddtrace/tracer/tracer.go @@ -6,8 +6,11 @@ package tracer import ( + "context" + gocontext "context" "fmt" "os" + "runtime/pprof" "strconv" "sync" "time" @@ -17,6 +20,7 @@ import ( "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/internal" "gopkg.in/DataDog/dd-trace-go.v1/internal/appsec" "gopkg.in/DataDog/dd-trace-go.v1/internal/log" + "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof" "gopkg.in/DataDog/dd-trace-go.v1/internal/version" "github.com/DataDog/datadog-agent/pkg/obfuscate" @@ -311,12 +315,24 @@ func (t *tracer) pushTrace(trace []*span) { } } -// StartSpan creates, starts, and returns a new Span with the given `operationName`. +// StartSpan implements ddtrace.Tracer. func (t *tracer) StartSpan(operationName string, options ...ddtrace.StartSpanOption) ddtrace.Span { + span, _ := t.StartSpanFromContext(gocontext.Background(), operationName, options...) + return span +} + +// StartSpanFromContext implements ddtrace.Tracer. +func (t *tracer) StartSpanFromContext(ctx gocontext.Context, operationName string, options ...ddtrace.StartSpanOption) (ddtrace.Span, gocontext.Context) { var opts ddtrace.StartSpanConfig for _, fn := range options { fn(&opts) } + if ctx == nil { + ctx = gocontext.Background() + } else if s, ok := SpanFromContext(ctx); ok { + // span in ctx overwrite ChildOf() parent if any + opts.Parent = s.Context() + } var startTime int64 if opts.StartTime.IsZero() { startTime = now() @@ -324,9 +340,17 @@ func (t *tracer) StartSpan(operationName string, options ...ddtrace.StartSpanOpt startTime = opts.StartTime.UnixNano() } var context *spanContext + pprofCtx := ctx if opts.Parent != nil { - if ctx, ok := opts.Parent.(*spanContext); ok { - context = ctx + if parentContext, ok := opts.Parent.(*spanContext); ok { + context = parentContext + if pprofCtx == gocontext.Background() && parentContext.span != nil && parentContext.span.pprofCtxActive != nil { + // Inherit the pprof labels from parent span if it was propagated using + // ChildOf() rather than StartSpanFromContext(). Having a separate ctx + // and pprofCtx is done to avoid subtle problems with callers relying + // on the details of the ContextWithSpan() wrapping below. + pprofCtx = parentContext.span.pprofCtxActive + } } } id := opts.SpanID @@ -400,13 +424,50 @@ func (t *tracer) StartSpan(operationName string, options ...ddtrace.StartSpanOpt // if not already sampled or a brand new trace, sample it t.sample(span) } + if t.config.profilerHotspots || t.config.profilerEndpoints { + ctx = t.applyPPROFLabels(pprofCtx, span) + } if t.config.serviceMappings != nil { if newSvc, ok := t.config.serviceMappings[span.Service]; ok { span.Service = newSvc } } log.Debug("Started Span: %v, Operation: %s, Resource: %s, Tags: %v, %v", span, span.Name, span.Resource, span.Meta, span.Metrics) - return span + return span, ContextWithSpan(ctx, span) +} + +// applyPPROFLabels applies pprof labels for the profiler's code hotspots and +// endpoint filtering feature to span. When span finishes, any pprof labels +// found in ctx are restored. +func (t *tracer) applyPPROFLabels(ctx gocontext.Context, span *span) context.Context { + var labels []string + if t.config.profilerHotspots { + labels = append(labels, traceprof.SpanID, strconv.FormatUint(span.SpanID, 10)) + } + // nil checks might not be needed, but better be safe than sorry + if span.context.trace != nil && span.context.trace.root != nil { + localRootSpan := span.context.trace.root + if t.config.profilerHotspots { + labels = append(labels, traceprof.LocalRootSpanID, strconv.FormatUint(localRootSpan.SpanID, 10)) + } + if t.config.profilerEndpoints && spanResourcePIISafe(localRootSpan) { + labels = append(labels, traceprof.TraceEndpoint, localRootSpan.Resource) + } + } + if len(labels) > 0 { + span.pprofCtxRestore = ctx + span.pprofCtxActive = pprof.WithLabels(ctx, pprof.Labels(labels...)) + pprof.SetGoroutineLabels(span.pprofCtxActive) + return span.pprofCtxActive + } + return ctx +} + +// spanResourcePIISafe returns true if s.Resource can be considered to not +// include PII with reasonable confidence. E.g. SQL queries may contain PII, +// but http or rpc endpoint names generally do not. +func spanResourcePIISafe(s *span) bool { + return s.Type == ext.SpanTypeWeb || s.Type == ext.AppTypeRPC } // Stop stops the tracer. diff --git a/internal/traceprof/testapp/test_app.go b/internal/traceprof/testapp/test_app.go new file mode 100644 index 0000000000..bc66d4f4e5 --- /dev/null +++ b/internal/traceprof/testapp/test_app.go @@ -0,0 +1,8 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +// Package testapp has the protbuf/grpc definitions for the test application +// implemented in traceproftest. +package testapp diff --git a/internal/traceprof/testapp/test_app.pb.go b/internal/traceprof/testapp/test_app.pb.go new file mode 100644 index 0000000000..06ba2cc07e --- /dev/null +++ b/internal/traceprof/testapp/test_app.pb.go @@ -0,0 +1,242 @@ +// +//Run this generate Go code after changing things below: +// +//protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative internal/traceprof/testapp/test_app.proto + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.27.1 +// protoc v3.6.1 +// source: internal/traceprof/testapp/test_app.proto + +package testapp + +import ( + reflect "reflect" + sync "sync" + + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type WorkReq struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CpuDuration int64 `protobuf:"varint,1,opt,name=cpu_duration,json=cpuDuration,proto3" json:"cpu_duration,omitempty"` + SqlDuration int64 `protobuf:"varint,2,opt,name=sql_duration,json=sqlDuration,proto3" json:"sql_duration,omitempty"` +} + +func (x *WorkReq) Reset() { + *x = WorkReq{} + if protoimpl.UnsafeEnabled { + mi := &file_internal_traceprof_testapp_test_app_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *WorkReq) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*WorkReq) ProtoMessage() {} + +func (x *WorkReq) ProtoReflect() protoreflect.Message { + mi := &file_internal_traceprof_testapp_test_app_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use WorkReq.ProtoReflect.Descriptor instead. +func (*WorkReq) Descriptor() ([]byte, []int) { + return file_internal_traceprof_testapp_test_app_proto_rawDescGZIP(), []int{0} +} + +func (x *WorkReq) GetCpuDuration() int64 { + if x != nil { + return x.CpuDuration + } + return 0 +} + +func (x *WorkReq) GetSqlDuration() int64 { + if x != nil { + return x.SqlDuration + } + return 0 +} + +type WorkRes struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + SpanId string `protobuf:"bytes,1,opt,name=span_id,json=spanId,proto3" json:"span_id,omitempty"` + LocalRootSpanId string `protobuf:"bytes,2,opt,name=local_root_span_id,json=localRootSpanId,proto3" json:"local_root_span_id,omitempty"` +} + +func (x *WorkRes) Reset() { + *x = WorkRes{} + if protoimpl.UnsafeEnabled { + mi := &file_internal_traceprof_testapp_test_app_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *WorkRes) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*WorkRes) ProtoMessage() {} + +func (x *WorkRes) ProtoReflect() protoreflect.Message { + mi := &file_internal_traceprof_testapp_test_app_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use WorkRes.ProtoReflect.Descriptor instead. +func (*WorkRes) Descriptor() ([]byte, []int) { + return file_internal_traceprof_testapp_test_app_proto_rawDescGZIP(), []int{1} +} + +func (x *WorkRes) GetSpanId() string { + if x != nil { + return x.SpanId + } + return "" +} + +func (x *WorkRes) GetLocalRootSpanId() string { + if x != nil { + return x.LocalRootSpanId + } + return "" +} + +var File_internal_traceprof_testapp_test_app_proto protoreflect.FileDescriptor + +var file_internal_traceprof_testapp_test_app_proto_rawDesc = []byte{ + 0x0a, 0x29, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x2f, 0x74, 0x72, 0x61, 0x63, 0x65, + 0x70, 0x72, 0x6f, 0x66, 0x2f, 0x74, 0x65, 0x73, 0x74, 0x61, 0x70, 0x70, 0x2f, 0x74, 0x65, 0x73, + 0x74, 0x5f, 0x61, 0x70, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x07, 0x74, 0x65, 0x73, + 0x74, 0x61, 0x70, 0x70, 0x22, 0x4f, 0x0a, 0x07, 0x57, 0x6f, 0x72, 0x6b, 0x52, 0x65, 0x71, 0x12, + 0x21, 0x0a, 0x0c, 0x63, 0x70, 0x75, 0x5f, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x63, 0x70, 0x75, 0x44, 0x75, 0x72, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x12, 0x21, 0x0a, 0x0c, 0x73, 0x71, 0x6c, 0x5f, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x73, 0x71, 0x6c, 0x44, 0x75, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x4f, 0x0a, 0x07, 0x57, 0x6f, 0x72, 0x6b, 0x52, 0x65, 0x73, + 0x12, 0x17, 0x0a, 0x07, 0x73, 0x70, 0x61, 0x6e, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x06, 0x73, 0x70, 0x61, 0x6e, 0x49, 0x64, 0x12, 0x2b, 0x0a, 0x12, 0x6c, 0x6f, 0x63, + 0x61, 0x6c, 0x5f, 0x72, 0x6f, 0x6f, 0x74, 0x5f, 0x73, 0x70, 0x61, 0x6e, 0x5f, 0x69, 0x64, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x52, 0x6f, 0x6f, 0x74, + 0x53, 0x70, 0x61, 0x6e, 0x49, 0x64, 0x32, 0x37, 0x0a, 0x07, 0x54, 0x65, 0x73, 0x74, 0x41, 0x70, + 0x70, 0x12, 0x2c, 0x0a, 0x04, 0x57, 0x6f, 0x72, 0x6b, 0x12, 0x10, 0x2e, 0x74, 0x65, 0x73, 0x74, + 0x61, 0x70, 0x70, 0x2e, 0x57, 0x6f, 0x72, 0x6b, 0x52, 0x65, 0x71, 0x1a, 0x10, 0x2e, 0x74, 0x65, + 0x73, 0x74, 0x61, 0x70, 0x70, 0x2e, 0x57, 0x6f, 0x72, 0x6b, 0x52, 0x65, 0x73, 0x22, 0x00, 0x42, + 0x3c, 0x5a, 0x3a, 0x67, 0x6f, 0x70, 0x6b, 0x67, 0x2e, 0x69, 0x6e, 0x2f, 0x44, 0x61, 0x74, 0x61, + 0x44, 0x6f, 0x67, 0x2f, 0x64, 0x64, 0x2d, 0x74, 0x72, 0x61, 0x63, 0x65, 0x2d, 0x67, 0x6f, 0x2e, + 0x76, 0x31, 0x2f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x2f, 0x74, 0x72, 0x61, 0x63, + 0x65, 0x70, 0x72, 0x6f, 0x66, 0x2f, 0x74, 0x65, 0x73, 0x74, 0x61, 0x70, 0x70, 0x62, 0x06, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_internal_traceprof_testapp_test_app_proto_rawDescOnce sync.Once + file_internal_traceprof_testapp_test_app_proto_rawDescData = file_internal_traceprof_testapp_test_app_proto_rawDesc +) + +func file_internal_traceprof_testapp_test_app_proto_rawDescGZIP() []byte { + file_internal_traceprof_testapp_test_app_proto_rawDescOnce.Do(func() { + file_internal_traceprof_testapp_test_app_proto_rawDescData = protoimpl.X.CompressGZIP(file_internal_traceprof_testapp_test_app_proto_rawDescData) + }) + return file_internal_traceprof_testapp_test_app_proto_rawDescData +} + +var file_internal_traceprof_testapp_test_app_proto_msgTypes = make([]protoimpl.MessageInfo, 2) +var file_internal_traceprof_testapp_test_app_proto_goTypes = []interface{}{ + (*WorkReq)(nil), // 0: testapp.WorkReq + (*WorkRes)(nil), // 1: testapp.WorkRes +} +var file_internal_traceprof_testapp_test_app_proto_depIdxs = []int32{ + 0, // 0: testapp.TestApp.Work:input_type -> testapp.WorkReq + 1, // 1: testapp.TestApp.Work:output_type -> testapp.WorkRes + 1, // [1:2] is the sub-list for method output_type + 0, // [0:1] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_internal_traceprof_testapp_test_app_proto_init() } +func file_internal_traceprof_testapp_test_app_proto_init() { + if File_internal_traceprof_testapp_test_app_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_internal_traceprof_testapp_test_app_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*WorkReq); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_internal_traceprof_testapp_test_app_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*WorkRes); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_internal_traceprof_testapp_test_app_proto_rawDesc, + NumEnums: 0, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_internal_traceprof_testapp_test_app_proto_goTypes, + DependencyIndexes: file_internal_traceprof_testapp_test_app_proto_depIdxs, + MessageInfos: file_internal_traceprof_testapp_test_app_proto_msgTypes, + }.Build() + File_internal_traceprof_testapp_test_app_proto = out.File + file_internal_traceprof_testapp_test_app_proto_rawDesc = nil + file_internal_traceprof_testapp_test_app_proto_goTypes = nil + file_internal_traceprof_testapp_test_app_proto_depIdxs = nil +} diff --git a/internal/traceprof/testapp/test_app.proto b/internal/traceprof/testapp/test_app.proto new file mode 100644 index 0000000000..763136be32 --- /dev/null +++ b/internal/traceprof/testapp/test_app.proto @@ -0,0 +1,24 @@ +/* +Run this generate Go code after changing things below: + +protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative internal/traceprof/testapp/test_app.proto +*/ + +syntax = "proto3"; + +option go_package = "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/testapp"; +package testapp; + +service TestApp { + rpc Work(WorkReq) returns (WorkRes) {} +} + +message WorkReq { + int64 cpu_duration = 1; + int64 sql_duration = 2; +} + +message WorkRes { + string span_id = 1; + string local_root_span_id = 2; +} diff --git a/internal/traceprof/testapp/test_app_grpc.pb.go b/internal/traceprof/testapp/test_app_grpc.pb.go new file mode 100644 index 0000000000..5b0cdce070 --- /dev/null +++ b/internal/traceprof/testapp/test_app_grpc.pb.go @@ -0,0 +1,102 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. + +package testapp + +import ( + context "context" + + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// TestAppClient is the client API for TestApp service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type TestAppClient interface { + Work(ctx context.Context, in *WorkReq, opts ...grpc.CallOption) (*WorkRes, error) +} + +type testAppClient struct { + cc grpc.ClientConnInterface +} + +func NewTestAppClient(cc grpc.ClientConnInterface) TestAppClient { + return &testAppClient{cc} +} + +func (c *testAppClient) Work(ctx context.Context, in *WorkReq, opts ...grpc.CallOption) (*WorkRes, error) { + out := new(WorkRes) + err := c.cc.Invoke(ctx, "/testapp.TestApp/Work", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// TestAppServer is the server API for TestApp service. +// All implementations must embed UnimplementedTestAppServer +// for forward compatibility +type TestAppServer interface { + Work(context.Context, *WorkReq) (*WorkRes, error) + mustEmbedUnimplementedTestAppServer() +} + +// UnimplementedTestAppServer must be embedded to have forward compatible implementations. +type UnimplementedTestAppServer struct { +} + +func (UnimplementedTestAppServer) Work(context.Context, *WorkReq) (*WorkRes, error) { + return nil, status.Errorf(codes.Unimplemented, "method Work not implemented") +} +func (UnimplementedTestAppServer) mustEmbedUnimplementedTestAppServer() {} + +// UnsafeTestAppServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to TestAppServer will +// result in compilation errors. +type UnsafeTestAppServer interface { + mustEmbedUnimplementedTestAppServer() +} + +func RegisterTestAppServer(s grpc.ServiceRegistrar, srv TestAppServer) { + s.RegisterService(&TestApp_ServiceDesc, srv) +} + +func _TestApp_Work_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(WorkReq) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(TestAppServer).Work(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/testapp.TestApp/Work", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(TestAppServer).Work(ctx, req.(*WorkReq)) + } + return interceptor(ctx, in, info, handler) +} + +// TestApp_ServiceDesc is the grpc.ServiceDesc for TestApp service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var TestApp_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "testapp.TestApp", + HandlerType: (*TestAppServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "Work", + Handler: _TestApp_Work_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "internal/traceprof/testapp/test_app.proto", +} diff --git a/internal/traceprof/traceprof.go b/internal/traceprof/traceprof.go new file mode 100644 index 0000000000..d2a7ec087f --- /dev/null +++ b/internal/traceprof/traceprof.go @@ -0,0 +1,20 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +// Package traceprof contains shared logic for cross-cutting tracer/profiler features. +package traceprof + +// pprof labels applied by the tracer to show up in the profiler's profiles. +const ( + SpanID = "span id" + LocalRootSpanID = "local root span id" + TraceEndpoint = "trace endpoint" +) + +// env variables used to control cross-cutting tracer/profiling features. +const ( + EndpointEnvVar = "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED" + CodeHotspotsEnvVar = "DD_PROFILING_CODE_HOTSPOTS_COLLECTION_ENABLED" +) diff --git a/internal/traceprof/traceproftest/app.go b/internal/traceprof/traceproftest/app.go new file mode 100644 index 0000000000..02e698c11e --- /dev/null +++ b/internal/traceprof/traceproftest/app.go @@ -0,0 +1,285 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +package traceproftest + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io/ioutil" + "net" + "net/http" + "net/http/httptest" + "runtime/pprof" + "testing" + "time" + + grpctrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/google.golang.org/grpc" + httptrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/julienschmidt/httprouter" + "gopkg.in/DataDog/dd-trace-go.v1/ddtrace" + "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/ext" + "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer" + "gopkg.in/DataDog/dd-trace-go.v1/internal/log" + pb "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/testapp" + + "github.com/julienschmidt/httprouter" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" +) + +const ( + // HTTPWorkEndpointMethod is the http method used for the demo app. + HTTPWorkEndpointMethod = "POST" + // HTTPWorkEndpoint is the http endpoint used for the demo app. + HTTPWorkEndpoint = "/work/:secret" + // GRPCWorkEndpoint is the grpc endpoint used for the demo app. + GRPCWorkEndpoint = "/testapp.TestApp/Work" +) + +// CustomLabels are the user-defined pprof labels to apply in the work endpoint +// to simulate user label interacting with our own labels. +var CustomLabels = map[string]string{"user label": "user val"} + +// AppConfig defines the behavior and profiling options for the demo app. +type AppConfig struct { + // Endpoints is passed to tracer.WithProfilerEndpoints() + Endpoints bool + // CodeHotspots is passed to tracer.WithProfilerCodeHotspots() + CodeHotspots bool + // ChildOf uses tracer.ChildOf() to declare the parent of cpuSpan instead of + // tracer.StartSpanFromContext(). + ChildOf bool + // AppType is the type of the test app that is being simulated. + AppType testAppType +} + +type testAppType string + +const ( + // Direct directly executes requests logic without any transport overhead. + Direct testAppType = "direct" + // GRPC executes requests via GRPC. + GRPC testAppType = "grpc" + // HTTP executes requests via HTTP. + HTTP testAppType = "http" +) + +// Endpoint returns the "trace endpoint" label value for this app type. +func (a testAppType) Endpoint() string { + switch a { + case Direct: + return "" + case GRPC: + return GRPCWorkEndpoint + case HTTP: + return HTTPWorkEndpointMethod + " " + HTTPWorkEndpoint + default: + panic("unreachable") + } +} + +// Start starts the demo app, including tracer and profiler. +func (c AppConfig) Start(t testing.TB) *App { + a := &App{config: c} + a.start(t) + return a +} + +// App is an instance of the demo app. +type App struct { + httpServer *httptest.Server + grpcServer *grpc.Server + grpcClientConn *grpc.ClientConn + CPUProfiler + prof *CPUProfile + config AppConfig + stopped bool + + pb.UnimplementedTestAppServer +} + +func (a *App) start(t testing.TB) { + tracer.Start( + tracer.WithLogger(log.DiscardLogger{}), + tracer.WithProfilerCodeHotspots(a.config.CodeHotspots), + tracer.WithProfilerEndpoints(a.config.Endpoints), + ) + + switch a.config.AppType { + case Direct: + // nothing to setup + case GRPC: + l, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + si := grpctrace.StreamServerInterceptor(grpctrace.WithServiceName("my-grpc-client")) + ui := grpctrace.UnaryServerInterceptor(grpctrace.WithServiceName("my-grpc-client")) + a.grpcServer = grpc.NewServer(grpc.StreamInterceptor(si), grpc.UnaryInterceptor(ui)) + pb.RegisterTestAppServer(a.grpcServer, a) + go a.grpcServer.Serve(l) + a.grpcClientConn, err = grpc.Dial(l.Addr().String(), grpc.WithInsecure()) + require.NoError(t, err) + case HTTP: + router := httptrace.New() + // We use a routing pattern here so our test can validate that potential + // Personal Identifiable Information (PII) values, in this case :secret, + // isn't beeing collected in the "trace endpoint" label. + router.Handle("POST", HTTPWorkEndpoint, a.workHandler) + a.httpServer = httptest.NewServer(router) + default: + panic("unreachable") + } + a.CPUProfiler.start(t) +} + +// Stop stops the app, tracer and cpu profiler in an idempotent fashion. +func (a *App) Stop(t testing.TB) { + if a.stopped { + return + } + a.prof = a.CPUProfiler.Stop(t) + tracer.Stop() + switch a.config.AppType { + case Direct: + // nothing to tear down + case GRPC: + a.grpcServer.GracefulStop() + a.grpcClientConn.Close() + case HTTP: + a.httpServer.Close() + default: + panic("unreachable") + } + a.stopped = true +} + +// WorkRequest sends the given req to the demo app and returns the response. +// The config.AppType determines how the request is made. +func (a *App) WorkRequest(t testing.TB, req *pb.WorkReq) *pb.WorkRes { + switch a.config.AppType { + case Direct: + res, err := a.Work(context.Background(), req) + require.NoError(t, err) + return res + case GRPC: + client := pb.NewTestAppClient(a.grpcClientConn) + res, err := client.Work(context.Background(), req) + require.NoError(t, err) + return res + case HTTP: + body, err := json.Marshal(req) + require.NoError(t, err) + url := a.httpServer.URL + "/work/secret-pii" + res, err := http.Post(url, "text/plain", bytes.NewReader(body)) + require.NoError(t, err) + + defer res.Body.Close() + workRes := &pb.WorkRes{} + require.NoError(t, json.NewDecoder(res.Body).Decode(&workRes)) + return workRes + default: + panic("unreachable") + } +} + +// CPUProfile stops the app and returns its CPU profile. +func (a *App) CPUProfile(t testing.TB) *CPUProfile { + a.Stop(t) + return a.prof +} + +func (a *App) workHandler(w http.ResponseWriter, r *http.Request, p httprouter.Params) { + req := &pb.WorkReq{} + if err := json.NewDecoder(r.Body).Decode(req); err != nil { + http.Error(w, "bad body", http.StatusBadRequest) + } + res, err := a.Work(r.Context(), req) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + json.NewEncoder(w).Encode(res) +} + +// Work implements the request handler for the demo app. It's reused regardless +// of the config.AppType. +func (a *App) Work(ctx context.Context, req *pb.WorkReq) (*pb.WorkRes, error) { + // Simulate user-defined custom labels to make sure we don't overwrite them + // when we apply our own labels. + ctx = pprof.WithLabels(ctx, toLabelSet(CustomLabels)) + pprof.SetGoroutineLabels(ctx) + + localRootSpan, ok := tracer.SpanFromContext(ctx) + // We run our handler in a reqSpan so we can test that we still include the + // correct "local root span id" in the profiler labels. + reqSpan, reqSpanCtx := tracer.StartSpanFromContext(ctx, "workHandler") + defer reqSpan.Finish() + if !ok { + // when app type is Direct, reqSpan is our local root span + localRootSpan = reqSpan + } + + // fakeSQLQuery pretends to execute an APM instrumented SQL query. This tests + // that the parent goroutine labels are correctly restored when it finishes. + fakeSQLQuery(reqSpanCtx, "SELECT * FROM foo", time.Duration(req.SqlDuration)) + + // cpuSpan is the one that should show up in our profile while cpuHog is + // running. decoySpan is a child span that finishes before the cpuHog work + // begins to test that span's restore their parent span labels when + // finishing. + var cpuSpan, decoySpan ddtrace.Span + if a.config.ChildOf { + cpuSpan = tracer.StartSpan("cpuHog", tracer.ChildOf(reqSpan.Context())) + decoySpan = tracer.StartSpan("decoy", tracer.ChildOf(cpuSpan.Context())) + } else { + var ctx context.Context + cpuSpan, ctx = tracer.StartSpanFromContext(reqSpanCtx, "cpuHog") + decoySpan, _ = tracer.StartSpanFromContext(ctx, "decoy") + } + decoySpan.Finish() + + // Perform CPU intense work on another goroutine. This should still be + // tracked to the childSpan thanks to goroutines inheriting labels. + stop := make(chan struct{}) + go cpuHogUntil(stop) + if req.CpuDuration > 0 { + time.Sleep(time.Duration(req.CpuDuration)) + } + close(stop) + cpuSpan.Finish() + + return &pb.WorkRes{ + LocalRootSpanId: fmt.Sprintf("%d", localRootSpan.Context().SpanID()), + SpanId: fmt.Sprintf("%d", cpuSpan.Context().SpanID()), + }, nil +} + +func toLabelSet(m map[string]string) pprof.LabelSet { + var args []string + for k, v := range m { + args = append(args, k, v) + } + return pprof.Labels(args...) +} + +func fakeSQLQuery(ctx context.Context, sql string, d time.Duration) { + span, _ := tracer.StartSpanFromContext(ctx, "pgx.query") + defer span.Finish() + span.SetTag(ext.ResourceName, sql) + time.Sleep(d) +} + +func cpuHogUntil(stop chan struct{}) { + for i := 0; ; i++ { + select { + case <-stop: + return + default: + // burn cpu + fmt.Fprintf(ioutil.Discard, "%d", i) + } + } +} diff --git a/internal/traceprof/traceproftest/baseline.txt b/internal/traceprof/traceproftest/baseline.txt new file mode 100644 index 0000000000..5e0f2ba727 --- /dev/null +++ b/internal/traceprof/traceproftest/baseline.txt @@ -0,0 +1,51 @@ +goos: linux +goarch: amd64 +pkg: gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/traceproftest +cpu: Intel(R) Xeon(R) Platinum 8275CL CPU @ 3.00GHz +BenchmarkEndpointsAndHotspots/direct/hello-world-4 4921081 15785 ns/op 15211 cpu-ns/op 0.01048 pprof-B/op 0.0008820 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 4892020 15767 ns/op 15132 cpu-ns/op 0.01063 pprof-B/op 0.0008847 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 4952154 15927 ns/op 15323 cpu-ns/op 0.01060 pprof-B/op 0.0008896 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 4965084 15699 ns/op 15149 cpu-ns/op 0.01059 pprof-B/op 0.0008939 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 4904989 15752 ns/op 15159 cpu-ns/op 0.01072 pprof-B/op 0.0008873 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 668 105784413 ns/op 95501038 cpu-ns/op 39.92 pprof-B/op 7.385 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 669 105511102 ns/op 95269102 cpu-ns/op 40.84 pprof-B/op 7.488 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 668 105763000 ns/op 95486729 cpu-ns/op 40.04 pprof-B/op 7.475 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 670 105647129 ns/op 95398376 cpu-ns/op 40.30 pprof-B/op 7.469 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 669 105676301 ns/op 95442234 cpu-ns/op 40.31 pprof-B/op 7.528 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 660 108170816 ns/op 17895706 cpu-ns/op 12.30 pprof-B/op 1.411 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 663 108555643 ns/op 18282702 cpu-ns/op 12.25 pprof-B/op 1.410 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 662 108640373 ns/op 18400929 cpu-ns/op 12.50 pprof-B/op 1.424 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 662 108709161 ns/op 18387981 cpu-ns/op 12.40 pprof-B/op 1.439 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 663 108537476 ns/op 18182355 cpu-ns/op 11.98 pprof-B/op 1.419 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 802096 90957 ns/op 89848 cpu-ns/op 0.09243 pprof-B/op 0.005170 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 816661 91077 ns/op 89966 cpu-ns/op 0.09163 pprof-B/op 0.005147 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 805726 91305 ns/op 90170 cpu-ns/op 0.09191 pprof-B/op 0.005182 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 804795 91338 ns/op 90216 cpu-ns/op 0.09206 pprof-B/op 0.005130 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 810649 90982 ns/op 89901 cpu-ns/op 0.09210 pprof-B/op 0.005101 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 662 106231692 ns/op 95926358 cpu-ns/op 43.62 pprof-B/op 7.565 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 662 106496325 ns/op 96186578 cpu-ns/op 43.37 pprof-B/op 7.548 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 666 106091798 ns/op 95819793 cpu-ns/op 43.04 pprof-B/op 7.466 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 672 106021833 ns/op 95720101 cpu-ns/op 42.70 pprof-B/op 7.506 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 666 106408102 ns/op 96104234 cpu-ns/op 43.23 pprof-B/op 7.538 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 666 108952881 ns/op 18584796 cpu-ns/op 15.14 pprof-B/op 1.398 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 663 108989035 ns/op 18578857 cpu-ns/op 14.84 pprof-B/op 1.466 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 666 108930125 ns/op 18509011 cpu-ns/op 14.89 pprof-B/op 1.429 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 662 109004308 ns/op 18616398 cpu-ns/op 13.92 pprof-B/op 1.424 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 664 108848134 ns/op 18456902 cpu-ns/op 14.25 pprof-B/op 1.415 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 579132 124004 ns/op 93700 cpu-ns/op 0.08870 pprof-B/op 0.004400 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 576729 123850 ns/op 93492 cpu-ns/op 0.08960 pprof-B/op 0.004363 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 575628 124019 ns/op 93802 cpu-ns/op 0.08961 pprof-B/op 0.004413 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 566127 124432 ns/op 93980 cpu-ns/op 0.08884 pprof-B/op 0.004376 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 580369 124002 ns/op 93632 cpu-ns/op 0.08970 pprof-B/op 0.004423 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 663 105846471 ns/op 95631702 cpu-ns/op 42.92 pprof-B/op 7.497 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 673 106236784 ns/op 95994293 cpu-ns/op 42.65 pprof-B/op 7.607 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 662 105743029 ns/op 95509385 cpu-ns/op 42.31 pprof-B/op 7.532 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 668 105871782 ns/op 95643201 cpu-ns/op 42.28 pprof-B/op 7.528 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 664 106567770 ns/op 96303681 cpu-ns/op 42.53 pprof-B/op 7.528 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 661 109036804 ns/op 18570105 cpu-ns/op 14.94 pprof-B/op 1.446 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 662 108958663 ns/op 18495461 cpu-ns/op 14.66 pprof-B/op 1.472 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 660 109053169 ns/op 18575706 cpu-ns/op 14.18 pprof-B/op 1.436 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 656 108724289 ns/op 18297399 cpu-ns/op 13.97 pprof-B/op 1.427 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 660 108614401 ns/op 18179128 cpu-ns/op 15.08 pprof-B/op 1.428 pprof-samples/op +PASS +ok gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/traceproftest 3690.445s diff --git a/internal/traceprof/traceproftest/bench.bash b/internal/traceprof/traceproftest/bench.bash new file mode 100755 index 0000000000..e7473872e6 --- /dev/null +++ b/internal/traceprof/traceproftest/bench.bash @@ -0,0 +1,18 @@ +set -eu +echo "-> Starting benchmark (ETA: 90min)" +BENCH_CMD="go test -benchtime 60s -count 5 -timeout 24h -run ^$ -bench . ." + +if ! which benchstat; then + echo "error: needs benchstat, install via:" + echo "go install golang.org/x/perf/cmd/benchstat@latest" + exit 1 +fi + +$BENCH_CMD | tee baseline.txt + +env \ + BENCH_HOTSPOTS=true \ + BENCH_ENDPOINTS=true \ + $BENCH_CMD | tee endpoints-and-hotspots.txt + +benchstat -sort delta baseline.txt endpoints-and-hotspots.txt | tee overhead.txt diff --git a/internal/traceprof/traceproftest/cpu_profiler.go b/internal/traceprof/traceproftest/cpu_profiler.go new file mode 100644 index 0000000000..148de661c5 --- /dev/null +++ b/internal/traceprof/traceproftest/cpu_profiler.go @@ -0,0 +1,135 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +package traceproftest + +import ( + "bytes" + "os" + "runtime/pprof" + "strings" + "testing" + "time" + + "github.com/google/pprof/profile" + "github.com/stretchr/testify/require" +) + +// StartCPUProfile starts a new CPU profile. +func StartCPUProfile(t testing.TB) *CPUProfiler { + cp := &CPUProfiler{} + cp.start(t) + return cp +} + +// CPUProfiler is a simplified implementation of the CPU profiler found in pkg +// profiler that retains essential performance characteristics but is more +// convenient for testing. +// +// TODO(fg) Would be nice to figure out a clean way to use the actual profiler +// pkg for this in the future. +type CPUProfiler struct { + buf bytes.Buffer + prof *CPUProfile + stopped bool +} + +// Stop stops the CPU profiler and returns the CPU profile. +func (c *CPUProfiler) Stop(t testing.TB) *CPUProfile { + if c.stopped { + return c.prof + } + c.stopped = true + pprof.StopCPUProfile() + var err error + c.prof, err = NewCPUProfile(c.buf.Bytes()) + require.NoError(t, err) + return c.prof +} + +// start starts the cpu profiler. +func (c *CPUProfiler) start(t testing.TB) { + require.NoError(t, pprof.StartCPUProfile(&c.buf)) +} + +// NewCPUProfile returns a new CPU profile for the given data. +func NewCPUProfile(data []byte) (*CPUProfile, error) { + cp := &CPUProfile{data: data} + prof, err := profile.ParseData(data) + cp.prof = prof + return cp, err +} + +// CPUProfile is a test utility to extract data from a CPU profile for testing. +type CPUProfile struct { + data []byte + prof *profile.Profile +} + +// Duration returns the total amont of CPU time in this profile. +func (c *CPUProfile) Duration() (d time.Duration) { + return c.LabelsDuration(nil) +} + +// LabelDuration returns the CPU time for the given pprof label in this +// profile. The special val "*" can be used to match any label value. +func (c *CPUProfile) LabelDuration(label, val string) (d time.Duration) { + return c.LabelsDuration(map[string]string{label: val}) +} + +// LabelsDuration returns the CPU time for the given pprof labels in this +// profile. The special val "*" can be used to match any label value. +func (c *CPUProfile) LabelsDuration(labels map[string]string) (d time.Duration) { +sampleloop: + for _, s := range c.prof.Sample { + for k, v := range labels { + if vals := s.Label[k]; len(vals) != 1 || (vals[0] != v && v != "*") { + continue sampleloop + } + } + d += time.Duration(s.Value[1]) + } + return d +} + +// Samples returns the number of samples in the CPU profile. +func (c *CPUProfile) Samples() int { + return len(c.prof.Sample) +} + +// Size returns the size of the pprof encoded CPU profile in bytes. +func (c *CPUProfile) Size() int { + return len(c.data) +} + +// Labels returns the number of samples per individual label in this profile. +func (c *CPUProfile) Labels() map[Label]int { + m := map[Label]int{} + for _, s := range c.prof.Sample { + for k, v := range s.Label { + val := strings.Join(v, ",") + lbl := Label{Key: k, Val: val} + m[lbl]++ + } + } + return m +} + +// WriteFile writes the profile to the given path. +func (c *CPUProfile) WriteFile(path string) error { + file, err := os.Create(path) + if err != nil { + return err + } + defer file.Close() + return c.prof.Write(file) +} + +// Label represents a simplified pprof label where the value is a +// comma-separated string rather than a []string. +type Label struct { + Key string + Val string +} diff --git a/internal/traceprof/traceproftest/endpoints-and-hotspots.txt b/internal/traceprof/traceproftest/endpoints-and-hotspots.txt new file mode 100644 index 0000000000..3eaf97a323 --- /dev/null +++ b/internal/traceprof/traceproftest/endpoints-and-hotspots.txt @@ -0,0 +1,51 @@ +goos: linux +goarch: amd64 +pkg: gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/traceproftest +cpu: Intel(R) Xeon(R) Platinum 8275CL CPU @ 3.00GHz +BenchmarkEndpointsAndHotspots/direct/hello-world-4 3739210 20677 ns/op 20019 cpu-ns/op 0.03021 pprof-B/op 0.001218 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 3623416 20979 ns/op 20130 cpu-ns/op 0.02966 pprof-B/op 0.001207 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 3631983 20825 ns/op 19961 cpu-ns/op 0.02999 pprof-B/op 0.001205 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 3636554 20747 ns/op 20001 cpu-ns/op 0.02942 pprof-B/op 0.001191 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/hello-world-4 3694231 20601 ns/op 19881 cpu-ns/op 0.02950 pprof-B/op 0.001188 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 669 106220497 ns/op 95934055 cpu-ns/op 90.13 pprof-B/op 7.404 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 670 105507300 ns/op 95270978 cpu-ns/op 89.46 pprof-B/op 7.352 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 667 105778813 ns/op 95539941 cpu-ns/op 90.05 pprof-B/op 7.456 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 666 105961107 ns/op 95729228 cpu-ns/op 90.64 pprof-B/op 7.434 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/cpu-bound-4 670 105634368 ns/op 95398926 cpu-ns/op 90.68 pprof-B/op 7.453 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 664 108494230 ns/op 18235434 cpu-ns/op 36.85 pprof-B/op 1.407 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 663 108659659 ns/op 18359072 cpu-ns/op 37.25 pprof-B/op 1.406 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 668 108443400 ns/op 18182275 cpu-ns/op 37.64 pprof-B/op 1.409 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 667 108651265 ns/op 18411639 cpu-ns/op 37.05 pprof-B/op 1.424 pprof-samples/op +BenchmarkEndpointsAndHotspots/direct/io-bound-4 667 108755768 ns/op 18465596 cpu-ns/op 36.66 pprof-B/op 1.428 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 748431 96830 ns/op 95688 cpu-ns/op 0.1325 pprof-B/op 0.004800 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 745563 96831 ns/op 95628 cpu-ns/op 0.1321 pprof-B/op 0.004741 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 754944 96760 ns/op 95613 cpu-ns/op 0.1308 pprof-B/op 0.004718 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 756376 96990 ns/op 95782 cpu-ns/op 0.1304 pprof-B/op 0.004737 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/hello-world-4 749299 96551 ns/op 95391 cpu-ns/op 0.1297 pprof-B/op 0.004694 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 663 105973269 ns/op 95700064 cpu-ns/op 102.4 pprof-B/op 7.402 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 664 106425759 ns/op 96124757 cpu-ns/op 102.1 pprof-B/op 7.412 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 666 106072086 ns/op 95784385 cpu-ns/op 103.3 pprof-B/op 7.487 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 667 106539706 ns/op 96258328 cpu-ns/op 102.5 pprof-B/op 7.460 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/cpu-bound-4 670 106118464 ns/op 95860915 cpu-ns/op 101.9 pprof-B/op 7.390 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 666 109171557 ns/op 18810566 cpu-ns/op 42.36 pprof-B/op 1.447 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 666 108967007 ns/op 18633837 cpu-ns/op 42.86 pprof-B/op 1.409 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 660 109051839 ns/op 18663982 cpu-ns/op 41.78 pprof-B/op 1.416 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 662 109222717 ns/op 18859824 cpu-ns/op 43.55 pprof-B/op 1.442 pprof-samples/op +BenchmarkEndpointsAndHotspots/http/io-bound-4 667 108748793 ns/op 18357311 cpu-ns/op 40.40 pprof-B/op 1.393 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 548379 132653 ns/op 106989 cpu-ns/op 0.1349 pprof-B/op 0.005111 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 549154 132422 ns/op 106848 cpu-ns/op 0.1374 pprof-B/op 0.005132 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 546322 132898 ns/op 107045 cpu-ns/op 0.1343 pprof-B/op 0.005120 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 546170 132744 ns/op 107097 cpu-ns/op 0.1358 pprof-B/op 0.005166 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/hello-world-4 538756 132750 ns/op 107020 cpu-ns/op 0.1351 pprof-B/op 0.005114 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 663 106066994 ns/op 95838461 cpu-ns/op 101.6 pprof-B/op 7.452 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 664 105968183 ns/op 95735758 cpu-ns/op 103.0 pprof-B/op 7.409 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 664 105896722 ns/op 95664111 cpu-ns/op 101.3 pprof-B/op 7.362 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 664 105989749 ns/op 95751738 cpu-ns/op 101.4 pprof-B/op 7.466 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/cpu-bound-4 667 105812325 ns/op 95554936 cpu-ns/op 101.3 pprof-B/op 7.440 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 662 109102443 ns/op 18646395 cpu-ns/op 42.54 pprof-B/op 1.448 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 662 109286255 ns/op 18855193 cpu-ns/op 41.62 pprof-B/op 1.447 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 662 109202894 ns/op 18748116 cpu-ns/op 42.46 pprof-B/op 1.466 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 662 109110671 ns/op 18681041 cpu-ns/op 41.90 pprof-B/op 1.435 pprof-samples/op +BenchmarkEndpointsAndHotspots/grpc/io-bound-4 658 108596149 ns/op 18191310 cpu-ns/op 41.77 pprof-B/op 1.416 pprof-samples/op +PASS +ok gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/traceproftest 3712.759s diff --git a/internal/traceprof/traceproftest/overhead.txt b/internal/traceprof/traceproftest/overhead.txt new file mode 100644 index 0000000000..7c68d216f5 --- /dev/null +++ b/internal/traceprof/traceproftest/overhead.txt @@ -0,0 +1,43 @@ +name old time/op new time/op delta +EndpointsAndHotspots/direct/hello-world-4 15.8µs ± 1% 20.8µs ± 1% +31.55% (p=0.008 n=5+5) +EndpointsAndHotspots/grpc/hello-world-4 124µs ± 0% 133µs ± 0% +6.96% (p=0.008 n=5+5) +EndpointsAndHotspots/http/hello-world-4 91.1µs ± 0% 96.8µs ± 0% +6.21% (p=0.008 n=5+5) +EndpointsAndHotspots/direct/cpu-bound-4 106ms ± 0% 106ms ± 0% ~ (p=0.690 n=5+5) +EndpointsAndHotspots/direct/io-bound-4 109ms ± 0% 109ms ± 0% ~ (p=0.690 n=5+5) +EndpointsAndHotspots/http/cpu-bound-4 106ms ± 0% 106ms ± 0% ~ (p=1.000 n=5+5) +EndpointsAndHotspots/http/io-bound-4 109ms ± 0% 109ms ± 0% ~ (p=0.310 n=5+5) +EndpointsAndHotspots/grpc/cpu-bound-4 106ms ± 0% 106ms ± 0% ~ (p=1.000 n=5+5) +EndpointsAndHotspots/grpc/io-bound-4 109ms ± 0% 109ms ± 0% ~ (p=0.151 n=5+5) + +name old cpu-time/op new cpu-time/op delta +EndpointsAndHotspots/direct/hello-world-4 15.2µs ± 1% 20.0µs ± 1% +31.61% (p=0.008 n=5+5) +EndpointsAndHotspots/grpc/hello-world-4 93.7µs ± 0% 107.0µs ± 0% +14.17% (p=0.008 n=5+5) +EndpointsAndHotspots/http/hello-world-4 90.0µs ± 0% 95.6µs ± 0% +6.22% (p=0.008 n=5+5) +EndpointsAndHotspots/direct/cpu-bound-4 95.4ms ± 0% 95.6ms ± 0% ~ (p=0.310 n=5+5) +EndpointsAndHotspots/direct/io-bound-4 18.2ms ± 2% 18.3ms ± 1% ~ (p=0.548 n=5+5) +EndpointsAndHotspots/http/cpu-bound-4 96.0ms ± 0% 95.9ms ± 0% ~ (p=1.000 n=5+5) +EndpointsAndHotspots/http/io-bound-4 18.5ms ± 0% 18.7ms ± 2% ~ (p=0.151 n=5+5) +EndpointsAndHotspots/grpc/cpu-bound-4 95.8ms ± 1% 95.7ms ± 0% ~ (p=1.000 n=5+5) +EndpointsAndHotspots/grpc/io-bound-4 18.4ms ± 1% 18.6ms ± 2% ~ (p=0.095 n=5+5) + +name old pprof-alloc/op new pprof-alloc/op delta +EndpointsAndHotspots/direct/io-bound-4 12.3B ± 2% 37.1B ± 1% +201.89% (p=0.008 n=5+5) +EndpointsAndHotspots/http/io-bound-4 14.6B ± 5% 42.2B ± 4% +188.81% (p=0.008 n=5+5) +EndpointsAndHotspots/grpc/io-bound-4 14.6B ± 4% 42.1B ± 1% +188.74% (p=0.008 n=5+5) +EndpointsAndHotspots/direct/hello-world-4 0.01B ± 1% 0.03B ± 2% +180.61% (p=0.008 n=5+5) +EndpointsAndHotspots/grpc/cpu-bound-4 42.5B ± 1% 101.7B ± 1% +139.13% (p=0.008 n=5+5) +EndpointsAndHotspots/http/cpu-bound-4 43.2B ± 1% 102.4B ± 1% +137.17% (p=0.008 n=5+5) +EndpointsAndHotspots/direct/cpu-bound-4 40.3B ± 1% 90.2B ± 1% +123.90% (p=0.008 n=5+5) +EndpointsAndHotspots/grpc/hello-world-4 0.09B ± 1% 0.14B ± 1% +51.75% (p=0.008 n=5+5) +EndpointsAndHotspots/http/hello-world-4 0.09B ± 0% 0.13B ± 1% +42.46% (p=0.008 n=5+5) + +name old pprof-samples/op new pprof-samples/op delta +EndpointsAndHotspots/direct/hello-world-4 0.00 ± 1% 0.00 ± 1% +35.41% (p=0.008 n=5+5) +EndpointsAndHotspots/grpc/hello-world-4 0.00 ± 1% 0.01 ± 1% +16.69% (p=0.008 n=5+5) +EndpointsAndHotspots/direct/cpu-bound-4 7.47 ± 1% 7.42 ± 1% ~ (p=0.095 n=5+5) +EndpointsAndHotspots/direct/io-bound-4 1.42 ± 1% 1.41 ± 1% ~ (p=0.333 n=5+5) +EndpointsAndHotspots/http/io-bound-4 1.43 ± 3% 1.42 ± 2% ~ (p=0.841 n=5+5) +EndpointsAndHotspots/grpc/io-bound-4 1.44 ± 2% 1.44 ± 2% ~ (p=0.841 n=5+5) +EndpointsAndHotspots/http/cpu-bound-4 7.52 ± 1% 7.43 ± 1% -1.25% (p=0.016 n=5+5) +EndpointsAndHotspots/grpc/cpu-bound-4 7.54 ± 1% 7.43 ± 1% -1.49% (p=0.008 n=5+5) +EndpointsAndHotspots/http/hello-world-4 0.01 ± 1% 0.00 ± 1% -7.93% (p=0.008 n=5+5) diff --git a/internal/traceprof/traceproftest/traceprof_test.go b/internal/traceprof/traceproftest/traceprof_test.go new file mode 100644 index 0000000000..6f57b17f3f --- /dev/null +++ b/internal/traceprof/traceproftest/traceprof_test.go @@ -0,0 +1,228 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +// traceproftest provides testing for cross-cutting tracer/profiler features. +// It's a separate package from traceprof to avoid circular dependencies. +package traceproftest + +import ( + "os" + "runtime" + "sync" + "testing" + "time" + + "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof" + pb "gopkg.in/DataDog/dd-trace-go.v1/internal/traceprof/testapp" + + "github.com/stretchr/testify/require" +) + +// TestEndpointsAndCodeHotspots tests the code hotspots and endpoint filtering +// feature implemented using pprof labels in the tracer which are picked up by +// the CPU profiler. This is done using a small test application that simulates +// a simple http or grpc workload. +func TestEndpointsAndCodeHotspots(t *testing.T) { + // The amount of time the profiler should be able to detect. It's much lower + // than CpuDuration to avoid flaky test behavior and because we're not trying + // assert the quality of the profiler beyond the presence of the right + // labels. + const minCPUDuration = 10 * time.Millisecond + + // testCommon runs the common parts of this test. + testCommon := func(t *testing.T, c AppConfig) (*pb.WorkRes, *CPUProfile) { + // Simulate a cpu-heavy request with a short sql query. This is a best-case + // scenario for CPU Code Hotspots. + req := &pb.WorkReq{ + CpuDuration: int64(90 * time.Millisecond), + SqlDuration: int64(10 * time.Millisecond), + } + + // Rerun test a few times with doubled duration until it passes to avoid + // flaky behavior in CI. + for attempt := 1; ; attempt++ { + app := c.Start(t) + defer app.Stop(t) + + res := app.WorkRequest(t, req) + prof := app.CPUProfile(t) + + notEnoughSamples := (prof.Duration() < minCPUDuration) || + (prof.LabelsDuration(CustomLabels) < minCPUDuration) || + (c.CodeHotspots && prof.LabelDuration(traceprof.SpanID, "*") < minCPUDuration) || + (c.AppType != Direct && c.Endpoints && prof.LabelDuration(traceprof.TraceEndpoint, "*") < minCPUDuration) + if attempt <= 5 && notEnoughSamples { + req.CpuDuration *= 2 + req.SqlDuration *= 2 + t.Logf("attempt %d: not enough cpu samples, doubling duration", attempt) + continue + } + require.True(t, ValidSpanID(res.SpanId)) + require.True(t, ValidSpanID(res.LocalRootSpanId)) + require.GreaterOrEqual(t, prof.Duration(), minCPUDuration) + require.GreaterOrEqual(t, prof.LabelsDuration(CustomLabels), minCPUDuration) + return res, prof + } + } + + for _, appType := range []testAppType{Direct, HTTP, GRPC} { + t.Run(string(appType), func(t *testing.T) { + t.Run("none", func(t *testing.T) { + res, prof := testCommon(t, AppConfig{ + AppType: appType, + }) + require.Zero(t, prof.LabelDuration(traceprof.SpanID, res.SpanId)) + require.Zero(t, prof.LabelDuration(traceprof.LocalRootSpanID, res.LocalRootSpanId)) + require.Zero(t, prof.LabelDuration(traceprof.TraceEndpoint, appType.Endpoint())) + }) + + t.Run("endpoints", func(t *testing.T) { + res, prof := testCommon(t, AppConfig{ + AppType: appType, + Endpoints: true, + }) + require.Zero(t, prof.LabelDuration(traceprof.SpanID, res.SpanId)) + require.Zero(t, prof.LabelDuration(traceprof.LocalRootSpanID, res.LocalRootSpanId)) + if appType != Direct { + require.GreaterOrEqual(t, prof.LabelDuration(traceprof.TraceEndpoint, appType.Endpoint()), minCPUDuration) + } + }) + + t.Run("code-hotspots", func(t *testing.T) { + res, prof := testCommon(t, AppConfig{ + AppType: appType, + CodeHotspots: true, + }) + require.GreaterOrEqual(t, prof.LabelsDuration(map[string]string{ + traceprof.SpanID: res.SpanId, + traceprof.LocalRootSpanID: res.LocalRootSpanId, + }), minCPUDuration) + require.Zero(t, prof.LabelDuration(traceprof.TraceEndpoint, appType.Endpoint())) + }) + + t.Run("all", func(t *testing.T) { + res, prof := testCommon(t, AppConfig{ + AppType: appType, + CodeHotspots: true, + Endpoints: true, + }) + wantLabels := map[string]string{ + traceprof.SpanID: res.SpanId, + traceprof.LocalRootSpanID: res.LocalRootSpanId, + } + if appType != Direct { + wantLabels[traceprof.TraceEndpoint] = appType.Endpoint() + } + require.GreaterOrEqual(t, prof.LabelsDuration(wantLabels), minCPUDuration) + }) + + t.Run("none-child-of", func(t *testing.T) { + res, prof := testCommon(t, AppConfig{ + AppType: appType, + ChildOf: true, + }) + require.Zero(t, prof.LabelDuration(traceprof.SpanID, res.SpanId)) + require.Zero(t, prof.LabelDuration(traceprof.LocalRootSpanID, res.LocalRootSpanId)) + require.Zero(t, prof.LabelDuration(traceprof.TraceEndpoint, appType.Endpoint())) + }) + + t.Run("all-child-of", func(t *testing.T) { + res, prof := testCommon(t, AppConfig{ + AppType: appType, + CodeHotspots: true, + Endpoints: true, + ChildOf: true, + }) + wantLabels := map[string]string{ + traceprof.SpanID: res.SpanId, + traceprof.LocalRootSpanID: res.LocalRootSpanId, + } + if appType != Direct { + wantLabels[traceprof.TraceEndpoint] = appType.Endpoint() + } + require.GreaterOrEqual(t, prof.LabelsDuration(wantLabels), minCPUDuration) + }) + }) + } +} + +// BenchmarkEndpointsAndHotspots tries to quantify the overhead (latency, pprof +// samples, cpu time, pprof size) of profiler endpoints and code hotspots. Use +// ./bench.sh for executing these benchmarks. +func BenchmarkEndpointsAndHotspots(b *testing.B) { + benchCommon := func(b *testing.B, appType testAppType, req *pb.WorkReq) { + config := AppConfig{ + CodeHotspots: os.Getenv("BENCH_ENDPOINTS") == "true", + Endpoints: os.Getenv("BENCH_HOTSPOTS") == "true", + AppType: appType, + } + app := config.Start(b) + defer app.Stop(b) + + b.ResetTimer() + var ( + wg sync.WaitGroup + concurrency = runtime.GOMAXPROCS(0) + startCPUTime = CPURusage(b) + ) + for g := 0; g < concurrency; g++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < b.N; i++ { + app.WorkRequest(b, req) + } + }() + } + wg.Wait() + cpuTime := CPURusage(b) - startCPUTime + b.StopTimer() + + prof := app.CPUProfile(b) + expectedCPUTime := time.Duration(b.N) * time.Duration(req.CpuDuration) + if expectedCPUTime >= 90*time.Millisecond { + // sanity check profile results if enough samples can be expected + require.Greater(b, prof.Duration(), time.Duration(0)) + if config.CodeHotspots { + require.Greater(b, prof.LabelDuration(traceprof.SpanID, "*"), time.Duration(0)) + require.Greater(b, prof.LabelDuration(traceprof.LocalRootSpanID, "*"), time.Duration(0)) + } + if config.Endpoints && appType != Direct { + require.Greater(b, prof.LabelDuration(traceprof.TraceEndpoint, appType.Endpoint()), time.Duration(0)) + } + } + + b.ReportMetric(float64(prof.Samples())/float64(b.N*concurrency), "pprof-samples/op") + b.ReportMetric(float64(prof.Size())/float64(b.N*concurrency), "pprof-B/op") + b.ReportMetric(float64(cpuTime)/float64(b.N*concurrency), "cpu-ns/op") + } + + for _, appType := range []testAppType{Direct, HTTP, GRPC} { + b.Run(string(appType), func(b *testing.B) { + b.Run("hello-world", func(b *testing.B) { + // Simulates a handler that does no actual work other than paying for + // the instrumentation overhead. + benchCommon(b, appType, &pb.WorkReq{ + CpuDuration: int64(0 * time.Millisecond), + SqlDuration: int64(0 * time.Millisecond), + }) + }) + + b.Run("cpu-bound", func(b *testing.B) { + benchCommon(b, appType, &pb.WorkReq{ + CpuDuration: int64(90 * time.Millisecond), + SqlDuration: int64(10 * time.Millisecond), + }) + }) + + b.Run("io-bound", func(b *testing.B) { + benchCommon(b, appType, &pb.WorkReq{ + CpuDuration: int64(10 * time.Millisecond), + SqlDuration: int64(90 * time.Millisecond), + }) + }) + }) + } +} diff --git a/internal/traceprof/traceproftest/traceproftest.go b/internal/traceprof/traceproftest/traceproftest.go new file mode 100644 index 0000000000..7b4a55c6f5 --- /dev/null +++ b/internal/traceprof/traceproftest/traceproftest.go @@ -0,0 +1,9 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +// Package traceproftest contains test cases and test helpers for testing +// cross-cutting tracer/profiler features such as code hotspots and endpoints. +// The package is not supposed to be imported by other packages. +package traceproftest diff --git a/internal/traceprof/traceproftest/util.go b/internal/traceprof/traceproftest/util.go new file mode 100644 index 0000000000..727b1b7f18 --- /dev/null +++ b/internal/traceprof/traceproftest/util.go @@ -0,0 +1,37 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2021 Datadog, Inc. + +package traceproftest + +import ( + "strconv" + "syscall" + "testing" + "time" +) + +// ValidSpanID returns true if id is a valid span id (random.Uint64()). +func ValidSpanID(id string) bool { + val, err := strconv.ParseUint(id, 10, 64) + return err == nil && val > 0 +} + +// CPURusage returns the amount of On-CPU time for the process (sys+user) since +// it has been started. It uses getrusage(2). +func CPURusage(t testing.TB) time.Duration { + // Note: If this becomes a portability issue in the future (windows?) it's okay + // to implement a non-working version for those platforms. We only use this for + // benchmarking on linux right now. + var rusage syscall.Rusage + if err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage); err != nil { + t.Fatal(err) + panic(err) + } + return timevalDuration(rusage.Stime) + timevalDuration(rusage.Utime) +} + +func timevalDuration(tv syscall.Timeval) time.Duration { + return time.Duration(tv.Nano()) +} diff --git a/profiler/profile_test.go b/profiler/profile_test.go index fc599c7a67..c8978f7975 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -186,6 +186,7 @@ main;bar 0 0 8 16 stopCPUProfile = func() {} p, err := unstartedProfiler(CPUDuration(10 * time.Millisecond)) + require.NoError(t, err) start := time.Now() profs, err := p.runProfile(CPUProfile) end := time.Now() @@ -203,6 +204,7 @@ main;bar 0 0 8 16 } p, err := unstartedProfiler() + require.NoError(t, err) profs, err := p.runProfile(GoroutineProfile) require.NoError(t, err) assert.Equal(t, "goroutines.pprof", profs[0].name) @@ -233,12 +235,13 @@ main.main() ` defer func(old func(_ string, _ io.Writer, _ int) error) { lookupProfile = old }(lookupProfile) - lookupProfile = func(name string, w io.Writer, _ int) error { + lookupProfile = func(_ string, w io.Writer, _ int) error { _, err := w.Write([]byte(sample)) return err } p, err := unstartedProfiler() + require.NoError(t, err) profs, err := p.runProfile(expGoroutineWaitProfile) require.NoError(t, err) require.Equal(t, "goroutineswait.pprof", profs[0].name)