From 5f64165c8c719bdab64e63db3e4d1735f341f610 Mon Sep 17 00:00:00 2001 From: Gabriel Aszalos Date: Mon, 14 Jan 2019 14:08:41 -0500 Subject: [PATCH] Add datadog-trace-agent code. github.com/DataDog/datadog-trace-agent@e57142cafa35ccd811c40a648a44c942cb99ae29 --- cmd/trace-agent/agent.go | 352 +++++++ cmd/trace-agent/agent_test.go | 589 +++++++++++ cmd/trace-agent/concentrator.go | 176 ++++ cmd/trace-agent/concentrator_test.go | 421 ++++++++ cmd/trace-agent/log.go | 225 +++++ cmd/trace-agent/main.go | 179 ++++ cmd/trace-agent/main_nix.go | 23 + cmd/trace-agent/main_windows.go | 294 ++++++ cmd/trace-agent/model_test.go | 33 + cmd/trace-agent/sampler.go | 120 +++ cmd/trace-agent/service_mapper.go | 114 +++ cmd/trace-agent/service_mapper_test.go | 72 ++ cmd/trace-agent/sublayers.go | 91 ++ cmd/trace-agent/sublayers_test.go | 99 ++ cmd/trace-agent/trace_service_extractor.go | 42 + .../trace_service_extractor_test.go | 40 + cmd/trace-agent/windows_resources/project.ico | Bin 0 -> 204862 bytes .../windows_resources/project_16x16.ico | Bin 0 -> 1406 bytes .../windows_resources/project_32x32.ico | Bin 0 -> 3262 bytes .../windows_resources/trace-agent-msg.mc | 85 ++ .../windows_resources/trace-agent.rc | 54 ++ cmd/trace-agent/windows_resources/version.h | 7 + pkg/trace/agent/normalizer.go | 262 +++++ pkg/trace/agent/normalizer_test.go | 393 ++++++++ pkg/trace/agent/processed_trace.go | 26 + pkg/trace/agent/stats.go | 157 +++ pkg/trace/agent/stats_payload.go | 29 + pkg/trace/agent/stats_test.go | 639 ++++++++++++ pkg/trace/agent/statsraw.go | 280 ++++++ pkg/trace/agent/statsraw_test.go | 30 + pkg/trace/agent/sublayers.go | 268 +++++ pkg/trace/agent/sublayers_test.go | 445 +++++++++ pkg/trace/agent/tags.go | 306 ++++++ pkg/trace/agent/tags_test.go | 147 +++ pkg/trace/agent/truncator.go | 60 ++ pkg/trace/agent/truncator_test.go | 66 ++ pkg/trace/agent/weighted_span.go | 34 + pkg/trace/api/api.go | 425 ++++++++ pkg/trace/api/api_test.go | 718 ++++++++++++++ pkg/trace/api/limited_reader.go | 57 ++ pkg/trace/api/limited_reader_test.go | 105 ++ pkg/trace/api/listener.go | 80 ++ pkg/trace/api/logger.go | 38 + pkg/trace/api/responses.go | 74 ++ pkg/trace/config/README.md | 28 + pkg/trace/config/apply.go | 468 +++++++++ pkg/trace/config/apply_test.go | 24 + pkg/trace/config/config.go | 250 +++++ pkg/trace/config/config_nix.go | 20 + pkg/trace/config/config_test.go | 408 ++++++++ pkg/trace/config/config_windows.go | 37 + pkg/trace/config/env.go | 96 ++ pkg/trace/config/env_test.go | 260 +++++ pkg/trace/config/testdata/full.ini | 85 ++ pkg/trace/config/testdata/full.yaml | 63 ++ pkg/trace/config/testdata/multi_api_keys.ini | 3 + pkg/trace/config/testdata/no_apm_config.ini | 6 + pkg/trace/config/testdata/site_default.yaml | 2 + pkg/trace/config/testdata/site_eu.yaml | 3 + pkg/trace/config/testdata/site_override.yaml | 5 + pkg/trace/config/testdata/site_url.yaml | 4 + pkg/trace/config/testdata/undocumented.ini | 11 + pkg/trace/config/testdata/undocumented.yaml | 41 + pkg/trace/event/doc.go | 16 + pkg/trace/event/extractor.go | 14 + pkg/trace/event/extractor_fixed_rate.go | 40 + pkg/trace/event/extractor_fixed_rate_test.go | 52 + pkg/trace/event/extractor_legacy.go | 35 + pkg/trace/event/extractor_metric.go | 38 + pkg/trace/event/extractor_metric_test.go | 44 + pkg/trace/event/extractor_noop.go | 18 + pkg/trace/event/extractor_test.go | 36 + pkg/trace/event/processor.go | 120 +++ pkg/trace/event/processor_test.go | 136 +++ pkg/trace/event/sampler_max_eps.go | 149 +++ pkg/trace/event/sampler_max_eps_test.go | 74 ++ 
pkg/trace/filters/blacklister.go | 44 + pkg/trace/filters/blacklister_test.go | 48 + pkg/trace/filters/replacer.go | 43 + pkg/trace/filters/replacer_test.go | 130 +++ pkg/trace/flags/flags.go | 51 + pkg/trace/flags/flags_nix.go | 8 + pkg/trace/flags/flags_windows.go | 25 + pkg/trace/info/endpoint.go | 27 + pkg/trace/info/git_version.go | 17 + pkg/trace/info/info.go | 424 ++++++++ pkg/trace/info/info_test.go | 355 +++++++ pkg/trace/info/make.go | 63 ++ pkg/trace/info/sampler.go | 19 + pkg/trace/info/stats.go | 284 ++++++ pkg/trace/info/testdata/okay.info | 28 + pkg/trace/info/testdata/okay.json | 14 + pkg/trace/info/testdata/warning.info | 33 + pkg/trace/info/testdata/warning.json | 13 + pkg/trace/info/version.go | 58 ++ pkg/trace/info/writer.go | 71 ++ pkg/trace/metrics/metrics.go | 56 ++ pkg/trace/obfuscate/http.go | 56 ++ pkg/trace/obfuscate/http_test.go | 159 +++ pkg/trace/obfuscate/json.go | 150 +++ pkg/trace/obfuscate/json_scanner.go | 581 +++++++++++ pkg/trace/obfuscate/json_test.go | 119 +++ pkg/trace/obfuscate/memcached.go | 21 + pkg/trace/obfuscate/memcached_test.go | 43 + pkg/trace/obfuscate/obfuscate.go | 92 ++ pkg/trace/obfuscate/obfuscate_test.go | 233 +++++ pkg/trace/obfuscate/redis.go | 257 +++++ pkg/trace/obfuscate/redis_test.go | 381 ++++++++ pkg/trace/obfuscate/redis_tokenizer.go | 182 ++++ pkg/trace/obfuscate/redis_tokenizer_test.go | 231 +++++ pkg/trace/obfuscate/sql.go | 233 +++++ pkg/trace/obfuscate/sql_test.go | 488 ++++++++++ pkg/trace/obfuscate/sql_tokenizer.go | 501 ++++++++++ pkg/trace/obfuscate/testdata/json_tests.xml | 490 ++++++++++ pkg/trace/osutil/file.go | 38 + pkg/trace/pb/decoder.go | 195 ++++ pkg/trace/pb/decoder_test.go | 41 + pkg/trace/pb/doc.go | 7 + pkg/trace/pb/services.go | 4 + pkg/trace/pb/services_gen.go | 107 ++ pkg/trace/pb/services_gen_test.go | 67 ++ pkg/trace/pb/span.pb.go | 917 ++++++++++++++++++ pkg/trace/pb/span.proto | 20 + pkg/trace/pb/span_gen.go | 350 +++++++ pkg/trace/pb/trace.go | 7 + pkg/trace/pb/trace.pb.go | 404 ++++++++ pkg/trace/pb/trace.proto | 12 + pkg/trace/pb/trace_gen.go | 162 ++++ pkg/trace/pb/trace_gen_test.go | 122 +++ pkg/trace/pb/trace_payload.pb.go | 458 +++++++++ pkg/trace/pb/trace_payload.proto | 13 + pkg/trace/quantile/README.md | 14 + pkg/trace/quantile/summary.go | 239 +++++ pkg/trace/quantile/summary_bench_test.go | 104 ++ pkg/trace/quantile/summary_test.go | 195 ++++ pkg/trace/quantile/weighted.go | 84 ++ pkg/trace/quantile/weighted_test.go | 93 ++ pkg/trace/sampler/adjust.go | 67 ++ pkg/trace/sampler/adjust_test.go | 32 + pkg/trace/sampler/backend.go | 34 + pkg/trace/sampler/catalog.go | 44 + pkg/trace/sampler/catalog_test.go | 85 ++ pkg/trace/sampler/coresampler.go | 163 ++++ pkg/trace/sampler/coresampler_test.go | 84 ++ pkg/trace/sampler/dynamic_config.go | 70 ++ pkg/trace/sampler/dynamic_config_test.go | 198 ++++ pkg/trace/sampler/float64.go | 49 + pkg/trace/sampler/memory_backend.go | 163 ++++ pkg/trace/sampler/memory_backend_test.go | 92 ++ pkg/trace/sampler/presampler.go | 241 +++++ pkg/trace/sampler/presampler_test.go | 171 ++++ pkg/trace/sampler/prioritysampler.go | 146 +++ pkg/trace/sampler/prioritysampler_test.go | 218 +++++ pkg/trace/sampler/sampler.go | 189 ++++ pkg/trace/sampler/sampler_test.go | 65 ++ pkg/trace/sampler/score.go | 79 ++ pkg/trace/sampler/score_test.go | 44 + pkg/trace/sampler/scoresampler.go | 92 ++ pkg/trace/sampler/scoresampler_test.go | 124 +++ pkg/trace/sampler/signature.go | 88 ++ pkg/trace/sampler/signature_test.go | 128 +++ pkg/trace/sampler/state.go | 23 + 
pkg/trace/test/agent.go | 182 ++++ pkg/trace/test/backend.go | 152 +++ pkg/trace/test/buffer.go | 56 ++ pkg/trace/test/buffer_test.go | 33 + pkg/trace/test/doc.go | 4 + pkg/trace/test/example_test.go | 48 + pkg/trace/test/runner.go | 122 +++ pkg/trace/test/testsuite/hostname_test.go | 100 ++ pkg/trace/test/testutil/backoff.go | 59 ++ pkg/trace/test/testutil/random.go | 31 + pkg/trace/test/testutil/sampler.go | 42 + pkg/trace/test/testutil/services.go | 29 + pkg/trace/test/testutil/span.go | 329 +++++++ pkg/trace/test/testutil/span_test.go | 25 + pkg/trace/test/testutil/stats.go | 168 ++++ pkg/trace/test/testutil/stats_test.go | 19 + pkg/trace/test/testutil/statsd.go | 140 +++ pkg/trace/test/testutil/testutil.go | 11 + pkg/trace/test/testutil/trace.go | 112 +++ pkg/trace/traceutil/doc.go | 3 + pkg/trace/traceutil/span.go | 44 + pkg/trace/traceutil/span_test.go | 168 ++++ pkg/trace/traceutil/trace.go | 133 +++ pkg/trace/traceutil/trace_test.go | 56 ++ pkg/trace/watchdog/info.go | 179 ++++ pkg/trace/watchdog/info_test.go | 246 +++++ pkg/trace/watchdog/logonpanic.go | 43 + pkg/trace/watchdog/logonpanic_test.go | 100 ++ pkg/trace/watchdog/net.go | 33 + pkg/trace/watchdog/net_windows.go | 6 + pkg/trace/writer/backoff/backoff.go | 84 ++ pkg/trace/writer/backoff/backoff_test.go | 117 +++ pkg/trace/writer/backoff/exponential.go | 79 ++ pkg/trace/writer/backoff/exponential_test.go | 92 ++ pkg/trace/writer/config/payload.go | 25 + pkg/trace/writer/config/service_writer.go | 21 + pkg/trace/writer/config/stats_writer.go | 26 + pkg/trace/writer/config/trace_writer.go | 21 + pkg/trace/writer/endpoint.go | 175 ++++ pkg/trace/writer/endpoint_test.go | 146 +++ pkg/trace/writer/fixtures_test.go | 218 +++++ pkg/trace/writer/multi.go | 77 ++ pkg/trace/writer/multi_test.go | 203 ++++ pkg/trace/writer/payload.go | 380 ++++++++ pkg/trace/writer/payload_test.go | 540 +++++++++++ pkg/trace/writer/service.go | 176 ++++ pkg/trace/writer/service_test.go | 211 ++++ pkg/trace/writer/stats.go | 306 ++++++ pkg/trace/writer/stats_test.go | 405 ++++++++ pkg/trace/writer/trace.go | 296 ++++++ pkg/trace/writer/trace_test.go | 376 +++++++ 213 files changed, 29258 insertions(+) create mode 100644 cmd/trace-agent/agent.go create mode 100644 cmd/trace-agent/agent_test.go create mode 100644 cmd/trace-agent/concentrator.go create mode 100644 cmd/trace-agent/concentrator_test.go create mode 100644 cmd/trace-agent/log.go create mode 100644 cmd/trace-agent/main.go create mode 100644 cmd/trace-agent/main_nix.go create mode 100644 cmd/trace-agent/main_windows.go create mode 100644 cmd/trace-agent/model_test.go create mode 100644 cmd/trace-agent/sampler.go create mode 100644 cmd/trace-agent/service_mapper.go create mode 100644 cmd/trace-agent/service_mapper_test.go create mode 100644 cmd/trace-agent/sublayers.go create mode 100644 cmd/trace-agent/sublayers_test.go create mode 100644 cmd/trace-agent/trace_service_extractor.go create mode 100644 cmd/trace-agent/trace_service_extractor_test.go create mode 100644 cmd/trace-agent/windows_resources/project.ico create mode 100644 cmd/trace-agent/windows_resources/project_16x16.ico create mode 100644 cmd/trace-agent/windows_resources/project_32x32.ico create mode 100644 cmd/trace-agent/windows_resources/trace-agent-msg.mc create mode 100644 cmd/trace-agent/windows_resources/trace-agent.rc create mode 100644 cmd/trace-agent/windows_resources/version.h create mode 100644 pkg/trace/agent/normalizer.go create mode 100644 pkg/trace/agent/normalizer_test.go create mode 100644 
pkg/trace/agent/processed_trace.go create mode 100644 pkg/trace/agent/stats.go create mode 100644 pkg/trace/agent/stats_payload.go create mode 100644 pkg/trace/agent/stats_test.go create mode 100644 pkg/trace/agent/statsraw.go create mode 100644 pkg/trace/agent/statsraw_test.go create mode 100644 pkg/trace/agent/sublayers.go create mode 100644 pkg/trace/agent/sublayers_test.go create mode 100644 pkg/trace/agent/tags.go create mode 100644 pkg/trace/agent/tags_test.go create mode 100644 pkg/trace/agent/truncator.go create mode 100644 pkg/trace/agent/truncator_test.go create mode 100644 pkg/trace/agent/weighted_span.go create mode 100644 pkg/trace/api/api.go create mode 100644 pkg/trace/api/api_test.go create mode 100644 pkg/trace/api/limited_reader.go create mode 100644 pkg/trace/api/limited_reader_test.go create mode 100644 pkg/trace/api/listener.go create mode 100644 pkg/trace/api/logger.go create mode 100644 pkg/trace/api/responses.go create mode 100644 pkg/trace/config/README.md create mode 100644 pkg/trace/config/apply.go create mode 100644 pkg/trace/config/apply_test.go create mode 100644 pkg/trace/config/config.go create mode 100644 pkg/trace/config/config_nix.go create mode 100644 pkg/trace/config/config_test.go create mode 100644 pkg/trace/config/config_windows.go create mode 100644 pkg/trace/config/env.go create mode 100644 pkg/trace/config/env_test.go create mode 100644 pkg/trace/config/testdata/full.ini create mode 100644 pkg/trace/config/testdata/full.yaml create mode 100644 pkg/trace/config/testdata/multi_api_keys.ini create mode 100644 pkg/trace/config/testdata/no_apm_config.ini create mode 100644 pkg/trace/config/testdata/site_default.yaml create mode 100644 pkg/trace/config/testdata/site_eu.yaml create mode 100644 pkg/trace/config/testdata/site_override.yaml create mode 100644 pkg/trace/config/testdata/site_url.yaml create mode 100644 pkg/trace/config/testdata/undocumented.ini create mode 100644 pkg/trace/config/testdata/undocumented.yaml create mode 100644 pkg/trace/event/doc.go create mode 100644 pkg/trace/event/extractor.go create mode 100644 pkg/trace/event/extractor_fixed_rate.go create mode 100644 pkg/trace/event/extractor_fixed_rate_test.go create mode 100644 pkg/trace/event/extractor_legacy.go create mode 100644 pkg/trace/event/extractor_metric.go create mode 100644 pkg/trace/event/extractor_metric_test.go create mode 100644 pkg/trace/event/extractor_noop.go create mode 100644 pkg/trace/event/extractor_test.go create mode 100644 pkg/trace/event/processor.go create mode 100644 pkg/trace/event/processor_test.go create mode 100644 pkg/trace/event/sampler_max_eps.go create mode 100644 pkg/trace/event/sampler_max_eps_test.go create mode 100644 pkg/trace/filters/blacklister.go create mode 100644 pkg/trace/filters/blacklister_test.go create mode 100644 pkg/trace/filters/replacer.go create mode 100644 pkg/trace/filters/replacer_test.go create mode 100644 pkg/trace/flags/flags.go create mode 100644 pkg/trace/flags/flags_nix.go create mode 100644 pkg/trace/flags/flags_windows.go create mode 100644 pkg/trace/info/endpoint.go create mode 100644 pkg/trace/info/git_version.go create mode 100644 pkg/trace/info/info.go create mode 100644 pkg/trace/info/info_test.go create mode 100644 pkg/trace/info/make.go create mode 100644 pkg/trace/info/sampler.go create mode 100644 pkg/trace/info/stats.go create mode 100644 pkg/trace/info/testdata/okay.info create mode 100644 pkg/trace/info/testdata/okay.json create mode 100644 pkg/trace/info/testdata/warning.info create mode 100644 
pkg/trace/info/testdata/warning.json create mode 100644 pkg/trace/info/version.go create mode 100644 pkg/trace/info/writer.go create mode 100644 pkg/trace/metrics/metrics.go create mode 100644 pkg/trace/obfuscate/http.go create mode 100644 pkg/trace/obfuscate/http_test.go create mode 100644 pkg/trace/obfuscate/json.go create mode 100644 pkg/trace/obfuscate/json_scanner.go create mode 100644 pkg/trace/obfuscate/json_test.go create mode 100644 pkg/trace/obfuscate/memcached.go create mode 100644 pkg/trace/obfuscate/memcached_test.go create mode 100644 pkg/trace/obfuscate/obfuscate.go create mode 100644 pkg/trace/obfuscate/obfuscate_test.go create mode 100644 pkg/trace/obfuscate/redis.go create mode 100644 pkg/trace/obfuscate/redis_test.go create mode 100644 pkg/trace/obfuscate/redis_tokenizer.go create mode 100644 pkg/trace/obfuscate/redis_tokenizer_test.go create mode 100644 pkg/trace/obfuscate/sql.go create mode 100644 pkg/trace/obfuscate/sql_test.go create mode 100644 pkg/trace/obfuscate/sql_tokenizer.go create mode 100644 pkg/trace/obfuscate/testdata/json_tests.xml create mode 100644 pkg/trace/osutil/file.go create mode 100644 pkg/trace/pb/decoder.go create mode 100644 pkg/trace/pb/decoder_test.go create mode 100644 pkg/trace/pb/doc.go create mode 100644 pkg/trace/pb/services.go create mode 100644 pkg/trace/pb/services_gen.go create mode 100644 pkg/trace/pb/services_gen_test.go create mode 100644 pkg/trace/pb/span.pb.go create mode 100644 pkg/trace/pb/span.proto create mode 100644 pkg/trace/pb/span_gen.go create mode 100644 pkg/trace/pb/trace.go create mode 100644 pkg/trace/pb/trace.pb.go create mode 100644 pkg/trace/pb/trace.proto create mode 100644 pkg/trace/pb/trace_gen.go create mode 100644 pkg/trace/pb/trace_gen_test.go create mode 100644 pkg/trace/pb/trace_payload.pb.go create mode 100644 pkg/trace/pb/trace_payload.proto create mode 100644 pkg/trace/quantile/README.md create mode 100644 pkg/trace/quantile/summary.go create mode 100644 pkg/trace/quantile/summary_bench_test.go create mode 100644 pkg/trace/quantile/summary_test.go create mode 100644 pkg/trace/quantile/weighted.go create mode 100644 pkg/trace/quantile/weighted_test.go create mode 100644 pkg/trace/sampler/adjust.go create mode 100644 pkg/trace/sampler/adjust_test.go create mode 100644 pkg/trace/sampler/backend.go create mode 100644 pkg/trace/sampler/catalog.go create mode 100644 pkg/trace/sampler/catalog_test.go create mode 100644 pkg/trace/sampler/coresampler.go create mode 100644 pkg/trace/sampler/coresampler_test.go create mode 100644 pkg/trace/sampler/dynamic_config.go create mode 100644 pkg/trace/sampler/dynamic_config_test.go create mode 100644 pkg/trace/sampler/float64.go create mode 100644 pkg/trace/sampler/memory_backend.go create mode 100644 pkg/trace/sampler/memory_backend_test.go create mode 100644 pkg/trace/sampler/presampler.go create mode 100644 pkg/trace/sampler/presampler_test.go create mode 100644 pkg/trace/sampler/prioritysampler.go create mode 100644 pkg/trace/sampler/prioritysampler_test.go create mode 100644 pkg/trace/sampler/sampler.go create mode 100644 pkg/trace/sampler/sampler_test.go create mode 100644 pkg/trace/sampler/score.go create mode 100644 pkg/trace/sampler/score_test.go create mode 100644 pkg/trace/sampler/scoresampler.go create mode 100644 pkg/trace/sampler/scoresampler_test.go create mode 100644 pkg/trace/sampler/signature.go create mode 100644 pkg/trace/sampler/signature_test.go create mode 100644 pkg/trace/sampler/state.go create mode 100644 pkg/trace/test/agent.go create mode 
100644 pkg/trace/test/backend.go create mode 100644 pkg/trace/test/buffer.go create mode 100644 pkg/trace/test/buffer_test.go create mode 100644 pkg/trace/test/doc.go create mode 100644 pkg/trace/test/example_test.go create mode 100644 pkg/trace/test/runner.go create mode 100644 pkg/trace/test/testsuite/hostname_test.go create mode 100644 pkg/trace/test/testutil/backoff.go create mode 100644 pkg/trace/test/testutil/random.go create mode 100644 pkg/trace/test/testutil/sampler.go create mode 100644 pkg/trace/test/testutil/services.go create mode 100644 pkg/trace/test/testutil/span.go create mode 100644 pkg/trace/test/testutil/span_test.go create mode 100644 pkg/trace/test/testutil/stats.go create mode 100644 pkg/trace/test/testutil/stats_test.go create mode 100644 pkg/trace/test/testutil/statsd.go create mode 100644 pkg/trace/test/testutil/testutil.go create mode 100644 pkg/trace/test/testutil/trace.go create mode 100644 pkg/trace/traceutil/doc.go create mode 100644 pkg/trace/traceutil/span.go create mode 100644 pkg/trace/traceutil/span_test.go create mode 100644 pkg/trace/traceutil/trace.go create mode 100644 pkg/trace/traceutil/trace_test.go create mode 100644 pkg/trace/watchdog/info.go create mode 100644 pkg/trace/watchdog/info_test.go create mode 100644 pkg/trace/watchdog/logonpanic.go create mode 100644 pkg/trace/watchdog/logonpanic_test.go create mode 100644 pkg/trace/watchdog/net.go create mode 100644 pkg/trace/watchdog/net_windows.go create mode 100644 pkg/trace/writer/backoff/backoff.go create mode 100644 pkg/trace/writer/backoff/backoff_test.go create mode 100644 pkg/trace/writer/backoff/exponential.go create mode 100644 pkg/trace/writer/backoff/exponential_test.go create mode 100644 pkg/trace/writer/config/payload.go create mode 100644 pkg/trace/writer/config/service_writer.go create mode 100644 pkg/trace/writer/config/stats_writer.go create mode 100644 pkg/trace/writer/config/trace_writer.go create mode 100644 pkg/trace/writer/endpoint.go create mode 100644 pkg/trace/writer/endpoint_test.go create mode 100644 pkg/trace/writer/fixtures_test.go create mode 100644 pkg/trace/writer/multi.go create mode 100644 pkg/trace/writer/multi_test.go create mode 100644 pkg/trace/writer/payload.go create mode 100644 pkg/trace/writer/payload_test.go create mode 100644 pkg/trace/writer/service.go create mode 100644 pkg/trace/writer/service_test.go create mode 100644 pkg/trace/writer/stats.go create mode 100644 pkg/trace/writer/stats_test.go create mode 100644 pkg/trace/writer/trace.go create mode 100644 pkg/trace/writer/trace_test.go diff --git a/cmd/trace-agent/agent.go b/cmd/trace-agent/agent.go new file mode 100644 index 0000000000000..cf49838305d74 --- /dev/null +++ b/cmd/trace-agent/agent.go @@ -0,0 +1,352 @@ +package main + +import ( + "context" + "sync/atomic" + "time" + + log "github.com/cihub/seelog" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/api" + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/event" + "github.com/DataDog/datadog-agent/pkg/trace/filters" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/obfuscate" + "github.com/DataDog/datadog-agent/pkg/trace/osutil" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + 
"github.com/DataDog/datadog-agent/pkg/trace/watchdog" + "github.com/DataDog/datadog-agent/pkg/trace/writer" +) + +const processStatsInterval = time.Minute + +// Agent struct holds all the sub-routines structs and make the data flow between them +type Agent struct { + Receiver *api.HTTPReceiver + Concentrator *Concentrator + Blacklister *filters.Blacklister + Replacer *filters.Replacer + ScoreSampler *Sampler + ErrorsScoreSampler *Sampler + PrioritySampler *Sampler + EventProcessor *event.Processor + TraceWriter *writer.TraceWriter + ServiceWriter *writer.ServiceWriter + StatsWriter *writer.StatsWriter + ServiceExtractor *TraceServiceExtractor + ServiceMapper *ServiceMapper + + // obfuscator is used to obfuscate sensitive data from various span + // tags based on their type. + obfuscator *obfuscate.Obfuscator + + tracePkgChan chan *writer.TracePackage + + // config + conf *config.AgentConfig + dynConf *sampler.DynamicConfig + + // Used to synchronize on a clean exit + ctx context.Context +} + +// NewAgent returns a new Agent object, ready to be started. It takes a context +// which may be cancelled in order to gracefully stop the agent. +func NewAgent(ctx context.Context, conf *config.AgentConfig) *Agent { + dynConf := sampler.NewDynamicConfig(conf.DefaultEnv) + + // inter-component channels + rawTraceChan := make(chan pb.Trace, 5000) // about 1000 traces/sec for 5 sec, TODO: move to *model.Trace + tracePkgChan := make(chan *writer.TracePackage) + statsChan := make(chan []agent.StatsBucket) + serviceChan := make(chan pb.ServicesMetadata, 50) + filteredServiceChan := make(chan pb.ServicesMetadata, 50) + + // create components + r := api.NewHTTPReceiver(conf, dynConf, rawTraceChan, serviceChan) + c := NewConcentrator( + conf.ExtraAggregators, + conf.BucketInterval.Nanoseconds(), + statsChan, + ) + + obf := obfuscate.NewObfuscator(conf.Obfuscation) + ss := NewScoreSampler(conf) + ess := NewErrorsSampler(conf) + ps := NewPrioritySampler(conf, dynConf) + ep := eventProcessorFromConf(conf) + se := NewTraceServiceExtractor(serviceChan) + sm := NewServiceMapper(serviceChan, filteredServiceChan) + tw := writer.NewTraceWriter(conf, tracePkgChan) + sw := writer.NewStatsWriter(conf, statsChan) + svcW := writer.NewServiceWriter(conf, filteredServiceChan) + + return &Agent{ + Receiver: r, + Concentrator: c, + Blacklister: filters.NewBlacklister(conf.Ignore["resource"]), + Replacer: filters.NewReplacer(conf.ReplaceTags), + ScoreSampler: ss, + ErrorsScoreSampler: ess, + PrioritySampler: ps, + EventProcessor: ep, + TraceWriter: tw, + StatsWriter: sw, + ServiceWriter: svcW, + ServiceExtractor: se, + ServiceMapper: sm, + obfuscator: obf, + tracePkgChan: tracePkgChan, + conf: conf, + dynConf: dynConf, + ctx: ctx, + } +} + +// Run starts routers routines and individual pieces then stop them when the exit order is received +func (a *Agent) Run() { + // it's really important to use a ticker for this, and with a not too short + // interval, for this is our guarantee that the process won't start and kill + // itself too fast (nightmare loop) + watchdogTicker := time.NewTicker(a.conf.WatchdogInterval) + defer watchdogTicker.Stop() + + // update the data served by expvar so that we don't expose a 0 sample rate + info.UpdatePreSampler(*a.Receiver.PreSampler.Stats()) + + // TODO: unify components APIs. Use Start/Stop as non-blocking ways of controlling the blocking Run loop. + // Like we do with TraceWriter. 
+ a.Receiver.Run() + a.TraceWriter.Start() + a.StatsWriter.Start() + a.ServiceMapper.Start() + a.ServiceWriter.Start() + a.Concentrator.Start() + a.ScoreSampler.Run() + a.ErrorsScoreSampler.Run() + a.PrioritySampler.Run() + a.EventProcessor.Start() + + for { + select { + case t := <-a.Receiver.Out: + a.Process(t) + case <-watchdogTicker.C: + a.watchdog() + case <-a.ctx.Done(): + log.Info("exiting") + if err := a.Receiver.Stop(); err != nil { + log.Error(err) + } + a.Concentrator.Stop() + a.TraceWriter.Stop() + a.StatsWriter.Stop() + a.ServiceMapper.Stop() + a.ServiceWriter.Stop() + a.ScoreSampler.Stop() + a.ErrorsScoreSampler.Stop() + a.PrioritySampler.Stop() + a.EventProcessor.Stop() + return + } + } +} + +// Process is the default work unit that receives a trace, transforms it and +// passes it downstream. +func (a *Agent) Process(t pb.Trace) { + if len(t) == 0 { + log.Debugf("skipping received empty trace") + return + } + + // Root span is used to carry some trace-level metadata, such as sampling rate and priority. + root := traceutil.GetRoot(t) + + // We get the address of the struct holding the stats associated to no tags. + // TODO: get the real tagStats related to this trace payload (per lang/version). + ts := a.Receiver.Stats.GetTagStats(info.Tags{}) + + // Extract priority early, as later goroutines might manipulate the Metrics map in parallel which isn't safe. + priority, hasPriority := sampler.GetSamplingPriority(root) + + // Depending on the sampling priority, count that trace differently. + stat := &ts.TracesPriorityNone + if hasPriority { + if priority < 0 { + stat = &ts.TracesPriorityNeg + } else if priority == 0 { + stat = &ts.TracesPriority0 + } else if priority == 1 { + stat = &ts.TracesPriority1 + } else { + stat = &ts.TracesPriority2 + } + } + atomic.AddInt64(stat, 1) + + if !a.Blacklister.Allows(root) { + log.Debugf("trace rejected by blacklister. root: %v", root) + atomic.AddInt64(&ts.TracesFiltered, 1) + atomic.AddInt64(&ts.SpansFiltered, int64(len(t))) + return + } + + // Extra sanitization steps of the trace. + for _, span := range t { + a.obfuscator.Obfuscate(span) + agent.Truncate(span) + } + a.Replacer.Replace(&t) + + // Extract the client sampling rate. + clientSampleRate := sampler.GetGlobalRate(root) + sampler.SetClientRate(root, clientSampleRate) + // Combine it with the pre-sampling rate. + preSamplerRate := a.Receiver.PreSampler.Rate() + sampler.SetPreSampleRate(root, preSamplerRate) + // Update root's global sample rate to include the presampler rate as well + sampler.AddGlobalRate(root, preSamplerRate) + + // Figure out the top-level spans and sublayers now as it involves modifying the Metrics map + // which is not thread-safe while samplers and Concentrator might modify it too. + traceutil.ComputeTopLevel(t) + + subtraces := ExtractTopLevelSubtraces(t, root) + sublayers := make(map[*pb.Span][]agent.SublayerValue) + for _, subtrace := range subtraces { + subtraceSublayers := agent.ComputeSublayers(subtrace.Trace) + sublayers[subtrace.Root] = subtraceSublayers + agent.SetSublayersOnSpan(subtrace.Root, subtraceSublayers) + } + + pt := agent.ProcessedTrace{ + Trace: t, + WeightedTrace: agent.NewWeightedTrace(t, root), + Root: root, + Env: a.conf.DefaultEnv, + Sublayers: sublayers, + } + // Replace Agent-configured environment with `env` coming from span tag. 
+ if tenv := traceutil.GetEnv(t); tenv != "" { + pt.Env = tenv + } + + go func() { + defer watchdog.LogOnPanic() + a.ServiceExtractor.Process(pt.WeightedTrace) + }() + + go func(pt agent.ProcessedTrace) { + defer watchdog.LogOnPanic() + // Everything is sent to concentrator for stats, regardless of sampling. + a.Concentrator.Add(pt) + }(pt) + + // Don't go through sampling for < 0 priority traces + if priority < 0 { + return + } + // Run both full trace sampling and transaction extraction in another goroutine. + go func(pt agent.ProcessedTrace) { + defer watchdog.LogOnPanic() + + tracePkg := writer.TracePackage{} + + sampled, rate := a.sample(pt) + + if sampled { + pt.Sampled = sampled + sampler.AddGlobalRate(pt.Root, rate) + tracePkg.Trace = pt.Trace + } + + // NOTE: Events can be processed on non-sampled traces. + events, numExtracted := a.EventProcessor.Process(pt) + tracePkg.Events = events + + atomic.AddInt64(&ts.EventsExtracted, int64(numExtracted)) + atomic.AddInt64(&ts.EventsSampled, int64(len(tracePkg.Events))) + + if !tracePkg.Empty() { + a.tracePkgChan <- &tracePkg + } + }(pt) +} + +func (a *Agent) sample(pt agent.ProcessedTrace) (sampled bool, rate float64) { + var sampledPriority, sampledScore bool + var ratePriority, rateScore float64 + + if _, ok := pt.GetSamplingPriority(); ok { + sampledPriority, ratePriority = a.PrioritySampler.Add(pt) + } + + if traceContainsError(pt.Trace) { + sampledScore, rateScore = a.ErrorsScoreSampler.Add(pt) + } else { + sampledScore, rateScore = a.ScoreSampler.Add(pt) + } + + return sampledScore || sampledPriority, sampler.CombineRates(ratePriority, rateScore) +} + +// dieFunc is used by watchdog to kill the agent; replaced in tests. +var dieFunc = func(fmt string, args ...interface{}) { + osutil.Exitf(fmt, args...) 
+} + +func (a *Agent) watchdog() { + var wi watchdog.Info + wi.CPU = watchdog.CPU() + wi.Mem = watchdog.Mem() + wi.Net = watchdog.Net() + + if float64(wi.Mem.Alloc) > a.conf.MaxMemory && a.conf.MaxMemory > 0 { + dieFunc("exceeded max memory (current=%d, max=%d)", wi.Mem.Alloc, int64(a.conf.MaxMemory)) + } + if int(wi.Net.Connections) > a.conf.MaxConnections && a.conf.MaxConnections > 0 { + dieFunc("exceeded max connections (current=%d, max=%d)", wi.Net.Connections, a.conf.MaxConnections) + } + + info.UpdateWatchdogInfo(wi) + + // Adjust pre-sampling dynamically + rate, err := sampler.CalcPreSampleRate(a.conf.MaxCPU, wi.CPU.UserAvg, a.Receiver.PreSampler.RealRate()) + if err != nil { + log.Warnf("problem computing pre-sample rate: %v", err) + } + a.Receiver.PreSampler.SetRate(rate) + a.Receiver.PreSampler.SetError(err) + + preSamplerStats := a.Receiver.PreSampler.Stats() + metrics.Gauge("datadog.trace_agent.presampler_rate", preSamplerStats.Rate, nil, 1) + info.UpdatePreSampler(*preSamplerStats) +} + +func traceContainsError(trace pb.Trace) bool { + for _, span := range trace { + if span.Error != 0 { + return true + } + } + return false +} + +func eventProcessorFromConf(conf *config.AgentConfig) *event.Processor { + extractors := []event.Extractor{ + event.NewMetricBasedExtractor(), + } + if len(conf.AnalyzedSpansByService) > 0 { + extractors = append(extractors, event.NewFixedRateExtractor(conf.AnalyzedSpansByService)) + } else if len(conf.AnalyzedRateByServiceLegacy) > 0 { + extractors = append(extractors, event.NewLegacyExtractor(conf.AnalyzedRateByServiceLegacy)) + } + + return event.NewProcessor(extractors, conf.MaxEPS) +} diff --git a/cmd/trace-agent/agent_test.go b/cmd/trace-agent/agent_test.go new file mode 100644 index 0000000000000..e5398281703d8 --- /dev/null +++ b/cmd/trace-agent/agent_test.go @@ -0,0 +1,589 @@ +package main + +import ( + "context" + "fmt" + "math" + "net/http" + "os" + "regexp" + "runtime" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/event" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" + "github.com/stretchr/testify/assert" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/obfuscate" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" +) + +type mockSamplerEngine struct { + engine sampler.Engine +} + +func newMockSampler(wantSampled bool, wantRate float64) *Sampler { + return &Sampler{engine: testutil.NewMockEngine(wantSampled, wantRate)} +} + +func TestWatchdog(t *testing.T) { + if testing.Short() { + return + } + + conf := config.New() + conf.Endpoints[0].APIKey = "apikey_2" + conf.MaxMemory = 1e7 + conf.WatchdogInterval = time.Millisecond + + // save the global mux aside, we don't want to break other tests + defaultMux := http.DefaultServeMux + http.DefaultServeMux = http.NewServeMux() + + ctx, cancelFunc := context.WithCancel(context.Background()) + agnt := NewAgent(ctx, conf) + + defer func() { + cancelFunc() + // We need to manually close the receiver as the Run() func + // should have been broken and interrupted by the watchdog panic + agnt.Receiver.Stop() + http.DefaultServeMux = defaultMux + }() + + var killed bool + defer func() { + if r := recover(); r != nil { + killed = true + switch v := r.(type) { + case string: + if strings.HasPrefix(v, "exceeded max 
memory") { + t.Logf("watchdog worked, trapped the right error: %s", v) + runtime.GC() // make sure we clean up after allocating all this + return + } + } + t.Fatalf("unexpected error: %v", r) + } + }() + + // allocating a lot of memory + buf := make([]byte, 2*int64(conf.MaxMemory)) + buf[0] = 1 + buf[len(buf)-1] = 1 + + // override the default die, else our test would stop, use a plain panic() instead + oldDie := dieFunc + defer func() { dieFunc = oldDie }() + dieFunc = func(format string, args ...interface{}) { + panic(fmt.Sprintf(format, args...)) + } + + // after some time, the watchdog should kill this + agnt.Run() + + // without this. runtime could be smart and free memory before we Run() + buf[0] = 2 + buf[len(buf)-1] = 2 + + assert.True(t, killed) +} + +// Test to make sure that the joined effort of the quantizer and truncator, in that order, produce the +// desired string +func TestFormatTrace(t *testing.T) { + assert := assert.New(t) + resource := "SELECT name FROM people WHERE age = 42" + rep := strings.Repeat(" AND age = 42", 5000) + resource = resource + rep + testTrace := pb.Trace{ + &pb.Span{ + Resource: resource, + Type: "sql", + }, + } + result := formatTrace(testTrace)[0] + + assert.Equal(5000, len(result.Resource)) + assert.NotEqual("Non-parsable SQL query", result.Resource) + assert.NotContains(result.Resource, "42") + assert.Contains(result.Resource, "SELECT name FROM people WHERE age = ?") + + assert.Equal(5003, len(result.Meta["sql.query"])) // Ellipsis added in quantizer + assert.NotEqual("Non-parsable SQL query", result.Meta["sql.query"]) + assert.NotContains(result.Meta["sql.query"], "42") + assert.Contains(result.Meta["sql.query"], "SELECT name FROM people WHERE age = ?") +} + +func TestProcess(t *testing.T) { + t.Run("Replacer", func(t *testing.T) { + // Ensures that for "sql" type spans: + // • obfuscator runs before replacer + // • obfuscator obfuscates both resource and "sql.query" tag + // • resulting resource is obfuscated with replacements applied + // • resulting "sql.query" tag is obfuscated with no replacements applied + cfg := config.New() + cfg.Endpoints[0].APIKey = "test" + cfg.ReplaceTags = []*config.ReplaceRule{{ + Name: "resource.name", + Re: regexp.MustCompile("AND.*"), + Repl: "...", + }} + ctx, cancel := context.WithCancel(context.Background()) + agnt := NewAgent(ctx, cfg) + defer cancel() + + now := time.Now() + span := &pb.Span{ + Resource: "SELECT name FROM people WHERE age = 42 AND extra = 55", + Type: "sql", + Start: now.Add(-time.Second).UnixNano(), + Duration: (500 * time.Millisecond).Nanoseconds(), + } + agnt.Process(pb.Trace{span}) + + assert := assert.New(t) + assert.Equal("SELECT name FROM people WHERE age = ? ...", span.Resource) + assert.Equal("SELECT name FROM people WHERE age = ? 
AND extra = ?", span.Meta["sql.query"]) + }) + + t.Run("Blacklister", func(t *testing.T) { + cfg := config.New() + cfg.Endpoints[0].APIKey = "test" + cfg.Ignore["resource"] = []string{"^INSERT.*"} + ctx, cancel := context.WithCancel(context.Background()) + agnt := NewAgent(ctx, cfg) + defer cancel() + + now := time.Now() + spanValid := &pb.Span{ + Resource: "SELECT name FROM people WHERE age = 42 AND extra = 55", + Type: "sql", + Start: now.Add(-time.Second).UnixNano(), + Duration: (500 * time.Millisecond).Nanoseconds(), + } + spanInvalid := &pb.Span{ + Resource: "INSERT INTO db VALUES (1, 2, 3)", + Type: "sql", + Start: now.Add(-time.Second).UnixNano(), + Duration: (500 * time.Millisecond).Nanoseconds(), + } + + stats := agnt.Receiver.Stats.GetTagStats(info.Tags{}) + assert := assert.New(t) + + agnt.Process(pb.Trace{spanValid}) + assert.EqualValues(0, stats.TracesFiltered) + assert.EqualValues(0, stats.SpansFiltered) + + agnt.Process(pb.Trace{spanInvalid, spanInvalid}) + assert.EqualValues(1, stats.TracesFiltered) + assert.EqualValues(2, stats.SpansFiltered) + }) + + t.Run("Stats/Priority", func(t *testing.T) { + cfg := config.New() + cfg.Endpoints[0].APIKey = "test" + ctx, cancel := context.WithCancel(context.Background()) + agnt := NewAgent(ctx, cfg) + defer cancel() + + now := time.Now() + for _, key := range []sampler.SamplingPriority{ + sampler.PriorityNone, + sampler.PriorityUserDrop, + sampler.PriorityUserDrop, + sampler.PriorityAutoDrop, + sampler.PriorityAutoDrop, + sampler.PriorityAutoDrop, + sampler.PriorityAutoKeep, + sampler.PriorityAutoKeep, + sampler.PriorityAutoKeep, + sampler.PriorityAutoKeep, + sampler.PriorityUserKeep, + sampler.PriorityUserKeep, + sampler.PriorityUserKeep, + sampler.PriorityUserKeep, + sampler.PriorityUserKeep, + } { + span := &pb.Span{ + Resource: "SELECT name FROM people WHERE age = 42 AND extra = 55", + Type: "sql", + Start: now.Add(-time.Second).UnixNano(), + Duration: (500 * time.Millisecond).Nanoseconds(), + Metrics: map[string]float64{}, + } + if key != sampler.PriorityNone { + sampler.SetSamplingPriority(span, key) + } + agnt.Process(pb.Trace{span}) + } + + stats := agnt.Receiver.Stats.GetTagStats(info.Tags{}) + assert.EqualValues(t, 1, stats.TracesPriorityNone) + assert.EqualValues(t, 2, stats.TracesPriorityNeg) + assert.EqualValues(t, 3, stats.TracesPriority0) + assert.EqualValues(t, 4, stats.TracesPriority1) + assert.EqualValues(t, 5, stats.TracesPriority2) + }) +} + +func TestSampling(t *testing.T) { + for name, tt := range map[string]struct { + // hasErrors will be true if the input trace should have errors + // hasPriority will be true if the input trace should have sampling priority set + hasErrors, hasPriority bool + + // scoreRate, scoreErrorRate, priorityRate are the rates used by the mock samplers + scoreRate, scoreErrorRate, priorityRate float64 + + // scoreSampled, scoreErrorSampled, prioritySampled are the sample decisions of the mock samplers + scoreSampled, scoreErrorSampled, prioritySampled bool + + // wantRate and wantSampled are the expected result + wantRate float64 + wantSampled bool + }{ + "score and priority rate": { + hasPriority: true, + scoreRate: 0.5, + priorityRate: 0.6, + wantRate: sampler.CombineRates(0.5, 0.6), + }, + "score only rate": { + scoreRate: 0.5, + priorityRate: 0.1, + wantRate: 0.5, + }, + "error and priority rate": { + hasErrors: true, + hasPriority: true, + scoreErrorRate: 0.8, + priorityRate: 0.2, + wantRate: sampler.CombineRates(0.8, 0.2), + }, + "score not sampled decision": { + scoreSampled: 
false, + wantSampled: false, + }, + "score sampled decision": { + scoreSampled: true, + wantSampled: true, + }, + "score sampled priority not sampled": { + hasPriority: true, + scoreSampled: true, + prioritySampled: false, + wantSampled: true, + }, + "score not sampled priority sampled": { + hasPriority: true, + scoreSampled: false, + prioritySampled: true, + wantSampled: true, + }, + "score sampled priority sampled": { + hasPriority: true, + scoreSampled: true, + prioritySampled: true, + wantSampled: true, + }, + "score and priority not sampled": { + hasPriority: true, + scoreSampled: false, + prioritySampled: false, + wantSampled: false, + }, + "error not sampled decision": { + hasErrors: true, + scoreErrorSampled: false, + wantSampled: false, + }, + "error sampled decision": { + hasErrors: true, + scoreErrorSampled: true, + wantSampled: true, + }, + "error sampled priority not sampled": { + hasErrors: true, + hasPriority: true, + scoreErrorSampled: true, + prioritySampled: false, + wantSampled: true, + }, + "error not sampled priority sampled": { + hasErrors: true, + hasPriority: true, + scoreErrorSampled: false, + prioritySampled: true, + wantSampled: true, + }, + "error sampled priority sampled": { + hasErrors: true, + hasPriority: true, + scoreErrorSampled: true, + prioritySampled: true, + wantSampled: true, + }, + "error and priority not sampled": { + hasErrors: true, + hasPriority: true, + scoreErrorSampled: false, + prioritySampled: false, + wantSampled: false, + }, + } { + t.Run(name, func(t *testing.T) { + a := &Agent{ + ScoreSampler: newMockSampler(tt.scoreSampled, tt.scoreRate), + ErrorsScoreSampler: newMockSampler(tt.scoreErrorSampled, tt.scoreErrorRate), + PrioritySampler: newMockSampler(tt.prioritySampled, tt.priorityRate), + } + root := &pb.Span{ + Service: "serv1", + Start: time.Now().UnixNano(), + Duration: (100 * time.Millisecond).Nanoseconds(), + Metrics: map[string]float64{}, + } + + if tt.hasErrors { + root.Error = 1 + } + pt := agent.ProcessedTrace{Trace: pb.Trace{root}, Root: root} + if tt.hasPriority { + sampler.SetSamplingPriority(pt.Root, 1) + } + + sampled, rate := a.sample(pt) + assert.EqualValues(t, tt.wantRate, rate) + assert.EqualValues(t, tt.wantSampled, sampled) + }) + } +} + +func TestEventProcessorFromConf(t *testing.T) { + if _, ok := os.LookupEnv("INTEGRATION"); !ok { + t.Skip("set INTEGRATION environment variable to run") + } + if testing.Short() { + return + } + testMaxEPS := 100. 
+ rateByServiceAndName := map[string]map[string]float64{ + "serviceA": { + "opA": 0, + "opC": 1, + }, + "serviceB": { + "opB": 0.5, + }, + } + rateByService := map[string]float64{ + "serviceA": 1, + "serviceC": 0.5, + "serviceD": 1, + } + + for _, testCase := range []eventProcessorTestCase{ + // Name: //priority + {name: "none/below/none", intakeSPS: 100, serviceName: "serviceE", opName: "opA", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 0, deltaPct: 0, duration: 10 * time.Second}, + {name: "metric/below/none", intakeSPS: 100, serviceName: "serviceD", opName: "opA", extractionRate: 0.5, priority: sampler.PriorityNone, expectedEPS: 50, deltaPct: 0.1, duration: 10 * time.Second}, + {name: "metric/above/none", intakeSPS: 200, serviceName: "serviceD", opName: "opA", extractionRate: 1, priority: sampler.PriorityNone, expectedEPS: 100, deltaPct: 0.5, duration: 60 * time.Second}, + {name: "fixed/below/none", intakeSPS: 100, serviceName: "serviceB", opName: "opB", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 50, deltaPct: 0.1, duration: 10 * time.Second}, + {name: "fixed/above/none", intakeSPS: 200, serviceName: "serviceA", opName: "opC", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 100, deltaPct: 0.5, duration: 60 * time.Second}, + {name: "fixed/above/autokeep", intakeSPS: 200, serviceName: "serviceA", opName: "opC", extractionRate: -1, priority: sampler.PriorityAutoKeep, expectedEPS: 100, deltaPct: 0.5, duration: 60 * time.Second}, + {name: "metric/above/autokeep", intakeSPS: 200, serviceName: "serviceD", opName: "opA", extractionRate: 1, priority: sampler.PriorityAutoKeep, expectedEPS: 100, deltaPct: 0.5, duration: 60 * time.Second}, + // UserKeep traces allows overflow of EPS + {name: "metric/above/userkeep", intakeSPS: 200, serviceName: "serviceD", opName: "opA", extractionRate: 1, priority: sampler.PriorityUserKeep, expectedEPS: 200, deltaPct: 0.1, duration: 10 * time.Second}, + {name: "agent/above/userkeep", intakeSPS: 200, serviceName: "serviceA", opName: "opC", extractionRate: -1, priority: sampler.PriorityUserKeep, expectedEPS: 200, deltaPct: 0.1, duration: 10 * time.Second}, + + // Overrides (Name: /override/) + {name: "metric/override/fixed", intakeSPS: 100, serviceName: "serviceA", opName: "opA", extractionRate: 1, priority: sampler.PriorityNone, expectedEPS: 100, deltaPct: 0.1, duration: 10 * time.Second}, + // Legacy should never be considered if fixed rate is being used. + {name: "fixed/override/legacy", intakeSPS: 100, serviceName: "serviceA", opName: "opD", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 0, deltaPct: 0, duration: 10 * time.Second}, + } { + testEventProcessorFromConf(t, &config.AgentConfig{ + MaxEPS: testMaxEPS, + AnalyzedSpansByService: rateByServiceAndName, + AnalyzedRateByServiceLegacy: rateByService, + }, testCase) + } +} + +func TestEventProcessorFromConfLegacy(t *testing.T) { + if _, ok := os.LookupEnv("INTEGRATION"); !ok { + t.Skip("set INTEGRATION environment variable to run") + } + + testMaxEPS := 100. 
+ + rateByService := map[string]float64{ + "serviceA": 1, + "serviceC": 0.5, + "serviceD": 1, + } + + for _, testCase := range []eventProcessorTestCase{ + // Name: //priority + {name: "none/below/none", intakeSPS: 100, serviceName: "serviceE", opName: "opA", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 0, deltaPct: 0, duration: 10 * time.Second}, + {name: "legacy/below/none", intakeSPS: 100, serviceName: "serviceC", opName: "opB", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 50, deltaPct: 0.1, duration: 10 * time.Second}, + {name: "legacy/above/none", intakeSPS: 200, serviceName: "serviceD", opName: "opC", extractionRate: -1, priority: sampler.PriorityNone, expectedEPS: 100, deltaPct: 0.5, duration: 60 * time.Second}, + {name: "legacy/above/autokeep", intakeSPS: 200, serviceName: "serviceD", opName: "opC", extractionRate: -1, priority: sampler.PriorityAutoKeep, expectedEPS: 100, deltaPct: 0.5, duration: 60 * time.Second}, + // UserKeep traces allows overflow of EPS + {name: "legacy/above/userkeep", intakeSPS: 200, serviceName: "serviceD", opName: "opC", extractionRate: -1, priority: sampler.PriorityUserKeep, expectedEPS: 200, deltaPct: 0.1, duration: 10 * time.Second}, + + // Overrides (Name: /override/) + {name: "metrics/overrides/legacy", intakeSPS: 100, serviceName: "serviceC", opName: "opC", extractionRate: 1, priority: sampler.PriorityNone, expectedEPS: 100, deltaPct: 0.1, duration: 10 * time.Second}, + } { + testEventProcessorFromConf(t, &config.AgentConfig{ + MaxEPS: testMaxEPS, + AnalyzedRateByServiceLegacy: rateByService, + }, testCase) + } +} + +type eventProcessorTestCase struct { + name string + intakeSPS float64 + serviceName string + opName string + extractionRate float64 + priority sampler.SamplingPriority + expectedEPS float64 + deltaPct float64 + duration time.Duration +} + +func testEventProcessorFromConf(t *testing.T, conf *config.AgentConfig, testCase eventProcessorTestCase) { + t.Run(testCase.name, func(t *testing.T) { + processor := eventProcessorFromConf(conf) + processor.Start() + + actualEPS := generateTraffic(processor, testCase.serviceName, testCase.opName, testCase.extractionRate, + testCase.duration, testCase.intakeSPS, testCase.priority) + + processor.Stop() + + assert.InDelta(t, testCase.expectedEPS, actualEPS, testCase.expectedEPS*testCase.deltaPct) + }) +} + +// generateTraffic generates traces every 100ms with enough spans to meet the desired `intakeSPS` (intake spans per +// second). These spans will all have the provided service and operation names and be set as extractable/sampled +// based on the associated rate/%. This traffic generation will run for the specified `duration`. 
+func generateTraffic(processor *event.Processor, serviceName string, operationName string, extractionRate float64, + duration time.Duration, intakeSPS float64, priority sampler.SamplingPriority) float64 { + tickerInterval := 100 * time.Millisecond + totalSampled := 0 + timer := time.NewTimer(duration) + eventTicker := time.NewTicker(tickerInterval) + defer eventTicker.Stop() + numTicksInSecond := float64(time.Second) / float64(tickerInterval) + spansPerTick := int(math.Round(float64(intakeSPS) / numTicksInSecond)) + +Loop: + for { + spans := make([]*agent.WeightedSpan, spansPerTick) + for i := range spans { + span := testutil.RandomSpan() + span.Service = serviceName + span.Name = operationName + if extractionRate >= 0 { + span.Metrics[sampler.KeySamplingRateEventExtraction] = extractionRate + } + spans[i] = &agent.WeightedSpan{ + Span: span, + // Make all spans top level for simpler testing of legacy extractor + TopLevel: true, + } + } + trace := agent.ProcessedTrace{ + WeightedTrace: agent.WeightedTrace(spans), + Root: spans[0].Span, + } + if priority != sampler.PriorityNone { + sampler.SetSamplingPriority(trace.Root, priority) + } + + events, _ := processor.Process(trace) + totalSampled += len(events) + + <-eventTicker.C + select { + case <-timer.C: + // If timer ran out, break out of loop and stop generation + break Loop + default: + // Otherwise, lets generate another + } + } + return float64(totalSampled) / duration.Seconds() +} + +func BenchmarkAgentTraceProcessing(b *testing.B) { + c := config.New() + c.Endpoints[0].APIKey = "test" + + runTraceProcessingBenchmark(b, c) +} + +func BenchmarkAgentTraceProcessingWithFiltering(b *testing.B) { + c := config.New() + c.Endpoints[0].APIKey = "test" + c.Ignore["resource"] = []string{"[0-9]{3}", "foobar", "G.T [a-z]+", "[^123]+_baz"} + + runTraceProcessingBenchmark(b, c) +} + +// worst case scenario: spans are tested against multiple rules without any match. 
+// this means we won't compensate the overhead of filtering by dropping traces
+func BenchmarkAgentTraceProcessingWithWorstCaseFiltering(b *testing.B) {
+	c := config.New()
+	c.Endpoints[0].APIKey = "test"
+	c.Ignore["resource"] = []string{"[0-9]{3}", "foobar", "aaaaa?aaaa", "[^123]+_baz"}
+
+	runTraceProcessingBenchmark(b, c)
+}
+
+func runTraceProcessingBenchmark(b *testing.B, c *config.AgentConfig) {
+	ctx, cancelFunc := context.WithCancel(context.Background())
+	defer cancelFunc()
+	agent := NewAgent(ctx, c)
+	log.UseLogger(log.Disabled)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		agent.Process(testutil.RandomTrace(10, 8))
+	}
+}
+
+func BenchmarkWatchdog(b *testing.B) {
+	conf := config.New()
+	conf.Endpoints[0].APIKey = "apikey_2"
+	ctx, cancelFunc := context.WithCancel(context.Background())
+	defer cancelFunc()
+	agent := NewAgent(ctx, conf)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		agent.watchdog()
+	}
+}
+
+// Mimics the behaviour of the agent Process function
+func formatTrace(t pb.Trace) pb.Trace {
+	for _, span := range t {
+		obfuscate.NewObfuscator(nil).Obfuscate(span)
+		agent.Truncate(span)
+	}
+	return t
+}
diff --git a/cmd/trace-agent/concentrator.go b/cmd/trace-agent/concentrator.go
new file mode 100644
index 0000000000000..827dc99a80729
--- /dev/null
+++ b/cmd/trace-agent/concentrator.go
@@ -0,0 +1,176 @@
+package main
+
+import (
+	"sort"
+	"sync"
+	"time"
+
+	log "github.com/cihub/seelog"
+
+	"github.com/DataDog/datadog-agent/pkg/trace/agent"
+	"github.com/DataDog/datadog-agent/pkg/trace/watchdog"
+)
+
+// defaultBufferLen represents the default buffer length; the number of bucket size
+// units used by the concentrator.
+const defaultBufferLen = 2
+
+// Concentrator produces time bucketed statistics from a stream of raw traces.
+// https://en.wikipedia.org/wiki/Knelson_concentrator
+// Gets an imperial shitton of traces, and outputs pre-computed data structures
+// allowing us to find the gold (stats) amongst the traces.
+type Concentrator struct {
+	// list of attributes to use for extra aggregation
+	aggregators []string
+	// bucket duration in nanoseconds
+	bsize int64
+	// Timestamp of the oldest time bucket for which we allow data.
+	// Any ingested stats older than it get added to this bucket.
+	oldestTs int64
+	// bufferLen is the number of 10s stats buckets we keep in memory before flushing them.
+	// It means that we can compute stats only for the last `bufferLen * bsize` and that we
+	// wait such time before flushing the stats.
+	// This only applies to past buckets. Stats buckets in the future are allowed with no restriction.
+	bufferLen int
+
+	OutStats chan []agent.StatsBucket
+
+	exit   chan struct{}
+	exitWG *sync.WaitGroup
+
+	buckets map[int64]*agent.StatsRawBucket // buckets used to aggregate stats per timestamp
+	mu      sync.Mutex
+}
+
+// NewConcentrator initializes a new concentrator ready to be started
+func NewConcentrator(aggregators []string, bsize int64, out chan []agent.StatsBucket) *Concentrator {
+	c := Concentrator{
+		aggregators: aggregators,
+		bsize:       bsize,
+		buckets:     make(map[int64]*agent.StatsRawBucket),
+		// At start, only allow stats for the current time bucket. Ensure we don't
+		// override buckets which could have been sent before an Agent restart.
+		oldestTs: alignTs(time.Now().UnixNano(), bsize),
+		// TODO: Move to configuration.
+		bufferLen: defaultBufferLen,
+
+		OutStats: out,
+
+		exit:   make(chan struct{}),
+		exitWG: &sync.WaitGroup{},
+	}
+	sort.Strings(c.aggregators)
+	return &c
+}
+
+// Start starts the concentrator.
+func (c *Concentrator) Start() {
+	go func() {
+		defer watchdog.LogOnPanic()
+		c.Run()
+	}()
+}
+
+// Run runs the main loop of the concentrator goroutine. Traces are received
+// through `Add`; this loop only deals with flushing.
+func (c *Concentrator) Run() {
+	c.exitWG.Add(1)
+	defer c.exitWG.Done()
+
+	// flush with the same period as stats buckets
+	flushTicker := time.NewTicker(time.Duration(c.bsize) * time.Nanosecond)
+	defer flushTicker.Stop()
+
+	log.Debug("starting concentrator")
+
+	for {
+		select {
+		case <-flushTicker.C:
+			c.OutStats <- c.Flush()
+		case <-c.exit:
+			log.Info("exiting concentrator, computing remaining stats")
+			c.OutStats <- c.Flush()
+			return
+		}
+	}
+}
+
+// Stop stops the main Run loop.
+func (c *Concentrator) Stop() {
+	close(c.exit)
+	c.exitWG.Wait()
+}
+
+// Add appends this trace's statistics to the proper stats bucket
+func (c *Concentrator) Add(t agent.ProcessedTrace) {
+	c.addNow(t, time.Now().UnixNano())
+}
+
+func (c *Concentrator) addNow(t agent.ProcessedTrace, now int64) {
+	c.mu.Lock()
+
+	for _, s := range t.WeightedTrace {
+		// We do not compute stats for non-top-level spans since this is not surfaced in the UI
+		if !s.TopLevel {
+			continue
+		}
+		end := s.Start + s.Duration
+		btime := end - end%c.bsize
+
+		// If too far in the past, count in the oldest-allowed time bucket instead.
+		if btime < c.oldestTs {
+			btime = c.oldestTs
+		}
+
+		b, ok := c.buckets[btime]
+		if !ok {
+			b = agent.NewStatsRawBucket(btime, c.bsize)
+			c.buckets[btime] = b
+		}
+
+		sublayers, _ := t.Sublayers[s.Span]
+		b.HandleSpan(s, t.Env, c.aggregators, sublayers)
+	}
+
+	c.mu.Unlock()
+}
+
+// Flush deletes and returns complete statistic buckets
+func (c *Concentrator) Flush() []agent.StatsBucket {
+	return c.flushNow(time.Now().UnixNano())
+}
+
+func (c *Concentrator) flushNow(now int64) []agent.StatsBucket {
+	var sb []agent.StatsBucket
+
+	c.mu.Lock()
+	for ts, srb := range c.buckets {
+		// Always keep `bufferLen` buckets (default is 2: current + previous one).
+		// This is a trade-off: we accept slightly late traces (clock skew and stuff)
+		// but we delay flushing by at most `bufferLen` buckets.
+		if ts > now-int64(c.bufferLen)*c.bsize {
+			continue
+		}
+		log.Debugf("flushing bucket %d", ts)
+		sb = append(sb, srb.Export())
+		delete(c.buckets, ts)
+	}
+
+	// After flushing, update the oldest timestamp allowed to prevent having stats for
+	// an already-flushed bucket.
+	newOldestTs := alignTs(now, c.bsize) - int64(c.bufferLen-1)*c.bsize
+	if newOldestTs > c.oldestTs {
+		log.Debugf("update oldestTs to %d", newOldestTs)
+		c.oldestTs = newOldestTs
+	}
+
+	c.mu.Unlock()
+
+	return sb
+}
+
+// alignTs returns the provided timestamp truncated to the bucket size.
+// It gives us the start time of the time bucket in which the timestamp falls.
+func alignTs(ts int64, bsize int64) int64 { + return ts - ts%bsize +} diff --git a/cmd/trace-agent/concentrator_test.go b/cmd/trace-agent/concentrator_test.go new file mode 100644 index 0000000000000..4fbeb455d0028 --- /dev/null +++ b/cmd/trace-agent/concentrator_test.go @@ -0,0 +1,421 @@ +package main + +import ( + "fmt" + "math/rand" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + "github.com/stretchr/testify/assert" +) + +var testBucketInterval = time.Duration(2 * time.Second).Nanoseconds() + +func NewTestConcentrator() *Concentrator { + statsChan := make(chan []agent.StatsBucket) + return NewConcentrator([]string{}, time.Second.Nanoseconds(), statsChan) +} + +// getTsInBucket gives a timestamp in ns which is `offset` buckets late +func getTsInBucket(alignedNow int64, bsize int64, offset int64) int64 { + return alignedNow - offset*bsize + rand.Int63n(bsize) +} + +// testSpan avoids typo and inconsistency in test spans (typical pitfall: duration, start time, +// and end time are aligned, and end time is the one that needs to be aligned +func testSpan(spanID uint64, parentID uint64, duration, offset int64, service, resource string, err int32) *pb.Span { + now := time.Now().UnixNano() + alignedNow := now - now%testBucketInterval + + return &pb.Span{ + SpanID: spanID, + ParentID: parentID, + Duration: duration, + Start: getTsInBucket(alignedNow, testBucketInterval, offset) - duration, + Service: service, + Name: "query", + Resource: resource, + Error: err, + Type: "db", + } +} + +// TestConcentratorOldestTs tests that the Agent doesn't report time buckets from a +// time before its start +func TestConcentratorOldestTs(t *testing.T) { + assert := assert.New(t) + statsChan := make(chan []agent.StatsBucket) + + now := time.Now().UnixNano() + + // Build that simply have spans spread over time windows. + trace := pb.Trace{ + testSpan(1, 0, 50, 5, "A1", "resource1", 0), + testSpan(1, 0, 40, 4, "A1", "resource1", 0), + testSpan(1, 0, 30, 3, "A1", "resource1", 0), + testSpan(1, 0, 20, 2, "A1", "resource1", 0), + testSpan(1, 0, 10, 1, "A1", "resource1", 0), + testSpan(1, 0, 1, 0, "A1", "resource1", 0), + } + + traceutil.ComputeTopLevel(trace) + wt := agent.NewWeightedTrace(trace, traceutil.GetRoot(trace)) + + testTrace := agent.ProcessedTrace{ + Env: "none", + Trace: trace, + WeightedTrace: wt, + } + + t.Run("cold", func(t *testing.T) { + // Running cold, all spans in the past should end up in the current time bucket. + flushTime := now + c := NewConcentrator([]string{}, testBucketInterval, statsChan) + c.Add(testTrace) + + for i := 0; i < c.bufferLen; i++ { + stats := c.flushNow(flushTime) + if !assert.Equal(0, len(stats), "We should get exactly 0 StatsBucket") { + t.FailNow() + } + flushTime += testBucketInterval + } + + stats := c.flushNow(flushTime) + + if !assert.Equal(1, len(stats), "We should get exactly 1 StatsBucket") { + t.FailNow() + } + + // First oldest bucket aggregates old past time buckets, it should have it all. 
+ for key, count := range stats[0].Counts { + if key == "query|duration|env:none,resource:resource1,service:A1" { + assert.Equal(151, int(count.Value), "Wrong value for duration") + } + if key == "query|hits|env:none,resource:resource1,service:A1" { + assert.Equal(6, int(count.Value), "Wrong value for hits") + } + } + }) + + t.Run("hot", func(t *testing.T) { + flushTime := now + c := NewConcentrator([]string{}, testBucketInterval, statsChan) + c.oldestTs = alignTs(now, c.bsize) - int64(c.bufferLen-1)*c.bsize + c.Add(testTrace) + + for i := 0; i < c.bufferLen-1; i++ { + stats := c.flushNow(flushTime) + if !assert.Equal(0, len(stats), "We should get exactly 0 StatsBucket") { + t.FailNow() + } + flushTime += testBucketInterval + } + + stats := c.flushNow(flushTime) + if !assert.Equal(1, len(stats), "We should get exactly 1 StatsBucket") { + t.FailNow() + } + flushTime += testBucketInterval + + // First oldest bucket aggregates, it should have it all except the last span. + for key, count := range stats[0].Counts { + if key == "query|duration|env:none,resource:resource1,service:A1" { + assert.Equal(150, int(count.Value), "Wrong value for duration") + } + if key == "query|hits|env:none,resource:resource1,service:A1" { + assert.Equal(5, int(count.Value), "Wrong value for hits") + } + } + + stats = c.flushNow(flushTime) + if !assert.Equal(1, len(stats), "We should get exactly 1 StatsBucket") { + t.FailNow() + } + + // Stats of the last span. + for key, count := range stats[0].Counts { + if key == "query|duration|env:none,resource:resource1,service:A1" { + assert.Equal(1, int(count.Value), "Wrong value for duration") + } + if key == "query|hits|env:none,resource:resource1,service:A1" { + assert.Equal(1, int(count.Value), "Wrong value for hits") + } + } + }) +} + +// TestConcentratorStatsTotals tests that the total stats are correct, independently of the +// time bucket they end up in. +func TestConcentratorStatsTotals(t *testing.T) { + assert := assert.New(t) + statsChan := make(chan []agent.StatsBucket) + c := NewConcentrator([]string{}, testBucketInterval, statsChan) + + now := time.Now().UnixNano() + alignedNow := alignTs(now, c.bsize) + + // update oldestTs as if the concentrator had been running for quite some time, to avoid the fact + // that at startup it only allows recent stats. + c.oldestTs = alignedNow - int64(c.bufferLen)*c.bsize + + // Build a trace that simply has spans spread over time windows.
+ trace := pb.Trace{ + testSpan(1, 0, 50, 5, "A1", "resource1", 0), + testSpan(1, 0, 40, 4, "A1", "resource1", 0), + testSpan(1, 0, 30, 3, "A1", "resource1", 0), + testSpan(1, 0, 20, 2, "A1", "resource1", 0), + testSpan(1, 0, 10, 1, "A1", "resource1", 0), + testSpan(1, 0, 1, 0, "A1", "resource1", 0), + } + + traceutil.ComputeTopLevel(trace) + wt := agent.NewWeightedTrace(trace, traceutil.GetRoot(trace)) + + testTrace := agent.ProcessedTrace{ + Env: "none", + Trace: trace, + WeightedTrace: wt, + } + c.Add(testTrace) + + var hits float64 + var duration float64 + + flushTime := now + for i := 0; i <= c.bufferLen; i++ { + stats := c.flushNow(flushTime) + + if len(stats) == 0 { + continue + } + + for key, count := range stats[0].Counts { + if key == "query|duration|env:none,resource:resource1,service:A1" { + duration += count.Value + } + if key == "query|hits|env:none,resource:resource1,service:A1" { + hits += count.Value + } + } + flushTime += c.bsize + } + + assert.Equal(hits, float64(len(trace)), "Wrong value for total hits %d", hits) + assert.Equal(duration, float64(50+40+30+20+10+1), "Wrong value for total duration %d", duration) +} + +// TestConcentratorStatsCounts tests exhaustively each stats bucket, over multiple time buckets. +func TestConcentratorStatsCounts(t *testing.T) { + assert := assert.New(t) + statsChan := make(chan []agent.StatsBucket) + c := NewConcentrator([]string{}, testBucketInterval, statsChan) + + now := time.Now().UnixNano() + alignedNow := alignTs(now, c.bsize) + + // update oldestTs as it running for quite some time, to avoid the fact that at startup + // it only allows recent stats. + c.oldestTs = alignedNow - int64(c.bufferLen)*c.bsize + + // Build a trace with stats which should cover 3 time buckets. + trace := pb.Trace{ + // more than 2 buckets old, should be added to the 2 bucket-old, first flush. 
+ testSpan(1, 0, 111, 10, "A1", "resource1", 0), + testSpan(1, 0, 222, 3, "A1", "resource1", 0), + // 2 buckets old, part of the first flush + testSpan(1, 0, 24, 2, "A1", "resource1", 0), + testSpan(2, 0, 12, 2, "A1", "resource1", 2), + testSpan(3, 0, 40, 2, "A2", "resource2", 2), + testSpan(4, 0, 300000000000, 2, "A2", "resource2", 2), // 5 minutes trace + testSpan(5, 0, 30, 2, "A2", "resourcefoo", 0), + // 1 bucket old, part of the second flush + testSpan(6, 0, 24, 1, "A1", "resource2", 0), + testSpan(7, 0, 12, 1, "A1", "resource1", 2), + testSpan(8, 0, 40, 1, "A2", "resource1", 2), + testSpan(9, 0, 30, 1, "A2", "resource2", 2), + testSpan(10, 0, 3600000000000, 1, "A2", "resourcefoo", 0), // 1 hour trace + // present data, part of the third flush + testSpan(6, 0, 24, 0, "A1", "resource2", 0), + } + + expectedCountValByKeyByTime := make(map[int64]map[string]int64) + expectedCountValByKeyByTime[alignedNow-2*testBucketInterval] = map[string]int64{ + "query|duration|env:none,resource:resource1,service:A1": 369, + "query|duration|env:none,resource:resource2,service:A2": 300000000040, + "query|duration|env:none,resource:resourcefoo,service:A2": 30, + "query|errors|env:none,resource:resource1,service:A1": 1, + "query|errors|env:none,resource:resource2,service:A2": 2, + "query|errors|env:none,resource:resourcefoo,service:A2": 0, + "query|hits|env:none,resource:resource1,service:A1": 4, + "query|hits|env:none,resource:resource2,service:A2": 2, + "query|hits|env:none,resource:resourcefoo,service:A2": 1, + } + expectedCountValByKeyByTime[alignedNow-1*testBucketInterval] = map[string]int64{ + "query|duration|env:none,resource:resource1,service:A1": 12, + "query|duration|env:none,resource:resource2,service:A1": 24, + "query|duration|env:none,resource:resource1,service:A2": 40, + "query|duration|env:none,resource:resource2,service:A2": 30, + "query|duration|env:none,resource:resourcefoo,service:A2": 3600000000000, + "query|errors|env:none,resource:resource1,service:A1": 1, + "query|errors|env:none,resource:resource2,service:A1": 0, + "query|errors|env:none,resource:resource1,service:A2": 1, + "query|errors|env:none,resource:resource2,service:A2": 1, + "query|errors|env:none,resource:resourcefoo,service:A2": 0, + "query|hits|env:none,resource:resource1,service:A1": 1, + "query|hits|env:none,resource:resource2,service:A1": 1, + "query|hits|env:none,resource:resource1,service:A2": 1, + "query|hits|env:none,resource:resource2,service:A2": 1, + "query|hits|env:none,resource:resourcefoo,service:A2": 1, + } + expectedCountValByKeyByTime[alignedNow] = map[string]int64{ + "query|duration|env:none,resource:resource2,service:A1": 24, + "query|errors|env:none,resource:resource2,service:A1": 0, + "query|hits|env:none,resource:resource2,service:A1": 1, + } + expectedCountValByKeyByTime[alignedNow+testBucketInterval] = map[string]int64{} + + traceutil.ComputeTopLevel(trace) + wt := agent.NewWeightedTrace(trace, traceutil.GetRoot(trace)) + + testTrace := agent.ProcessedTrace{ + Env: "none", + Trace: trace, + WeightedTrace: wt, + } + c.Add(testTrace) + + // flush every testBucketInterval + flushTime := now + for i := 0; i <= c.bufferLen+2; i++ { + t.Run(fmt.Sprintf("flush-%d", i), func(t *testing.T) { + stats := c.flushNow(flushTime) + + expectedFlushedTs := alignTs(flushTime, c.bsize) - int64(c.bufferLen)*testBucketInterval + if len(expectedCountValByKeyByTime[expectedFlushedTs]) == 0 { + // That's a flush for which we expect no data + return + } + + if !assert.Equal(1, len(stats), "We should get exactly 1 
StatsBucket") { + t.FailNow() + } + + receivedBuckets := []agent.StatsBucket{stats[0]} + + assert.Equal(expectedFlushedTs, receivedBuckets[0].Start) + + expectedCountValByKey := expectedCountValByKeyByTime[expectedFlushedTs] + receivedCounts := receivedBuckets[0].Counts + + // verify we got all counts + assert.Equal(len(expectedCountValByKey), len(receivedCounts), "GOT %v", receivedCounts) + // verify values + for key, val := range expectedCountValByKey { + count, ok := receivedCounts[key] + assert.True(ok, "%s was expected from concentrator", key) + assert.Equal(val, int64(count.Value), "Wrong value for count %s", key) + } + + // Flushing again at the same time should return nothing + stats = c.flushNow(flushTime) + + if !assert.Equal(0, len(stats), "Second flush of the same time should be empty") { + t.FailNow() + } + + }) + flushTime += c.bsize + } +} + +// TestConcentratorSublayersStatsCounts tests exhaustively the sublayer stats of a single time window. +func TestConcentratorSublayersStatsCounts(t *testing.T) { + assert := assert.New(t) + statsChan := make(chan []agent.StatsBucket) + c := NewConcentrator([]string{}, testBucketInterval, statsChan) + + now := time.Now().UnixNano() + alignedNow := now - now%c.bsize + + trace := pb.Trace{ + // first bucket + testSpan(1, 0, 2000, 0, "A1", "resource1", 0), + testSpan(2, 1, 1000, 0, "A2", "resource2", 0), + testSpan(3, 1, 1000, 0, "A2", "resource3", 0), + testSpan(4, 2, 40, 0, "A3", "resource4", 0), + testSpan(5, 4, 300, 0, "A3", "resource5", 0), + testSpan(6, 2, 30, 0, "A3", "resource6", 0), + } + traceutil.ComputeTopLevel(trace) + wt := agent.NewWeightedTrace(trace, traceutil.GetRoot(trace)) + + subtraces := ExtractTopLevelSubtraces(trace, traceutil.GetRoot(trace)) + sublayers := make(map[*pb.Span][]agent.SublayerValue) + for _, subtrace := range subtraces { + subtraceSublayers := agent.ComputeSublayers(subtrace.Trace) + sublayers[subtrace.Root] = subtraceSublayers + } + + testTrace := agent.ProcessedTrace{ + Env: "none", + Trace: trace, + WeightedTrace: wt, + Sublayers: sublayers, + } + + c.Add(testTrace) + stats := c.flushNow(alignedNow + int64(c.bufferLen)*c.bsize) + + if !assert.Equal(1, len(stats), "We should get exactly 1 StatsBucket") { + t.FailNow() + } + + assert.Equal(alignedNow, stats[0].Start) + + var receivedCounts map[string]agent.Count + + // Start with the first/older bucket + receivedCounts = stats[0].Counts + expectedCountValByKey := map[string]int64{ + "query|_sublayers.duration.by_service|env:none,resource:resource1,service:A1,sublayer_service:A1": 2000, + "query|_sublayers.duration.by_service|env:none,resource:resource1,service:A1,sublayer_service:A2": 2000, + "query|_sublayers.duration.by_service|env:none,resource:resource1,service:A1,sublayer_service:A3": 370, + "query|_sublayers.duration.by_service|env:none,resource:resource4,service:A3,sublayer_service:A3": 340, + "query|_sublayers.duration.by_service|env:none,resource:resource2,service:A2,sublayer_service:A2": 1000, + "query|_sublayers.duration.by_service|env:none,resource:resource2,service:A2,sublayer_service:A3": 370, + "query|_sublayers.duration.by_type|env:none,resource:resource1,service:A1,sublayer_type:db": 4370, + "query|_sublayers.duration.by_type|env:none,resource:resource2,service:A2,sublayer_type:db": 1370, + "query|_sublayers.duration.by_type|env:none,resource:resource4,service:A3,sublayer_type:db": 340, + "query|_sublayers.span_count|env:none,resource:resource1,service:A1,:": 6, + "query|_sublayers.span_count|env:none,resource:resource2,service:A2,:": 
4, + "query|_sublayers.span_count|env:none,resource:resource4,service:A3,:": 2, + "query|duration|env:none,resource:resource1,service:A1": 2000, + "query|duration|env:none,resource:resource2,service:A2": 1000, + "query|duration|env:none,resource:resource3,service:A2": 1000, + "query|duration|env:none,resource:resource4,service:A3": 40, + "query|duration|env:none,resource:resource6,service:A3": 30, + "query|errors|env:none,resource:resource1,service:A1": 0, + "query|errors|env:none,resource:resource2,service:A2": 0, + "query|errors|env:none,resource:resource3,service:A2": 0, + "query|errors|env:none,resource:resource4,service:A3": 0, + "query|errors|env:none,resource:resource6,service:A3": 0, + "query|hits|env:none,resource:resource1,service:A1": 1, + "query|hits|env:none,resource:resource2,service:A2": 1, + "query|hits|env:none,resource:resource3,service:A2": 1, + "query|hits|env:none,resource:resource4,service:A3": 1, + "query|hits|env:none,resource:resource6,service:A3": 1, + } + + // verify we got all counts + assert.Equal(len(expectedCountValByKey), len(receivedCounts), "GOT %v", receivedCounts) + // verify values + for key, val := range expectedCountValByKey { + count, ok := receivedCounts[key] + assert.True(ok, "%s was expected from concentrator", key) + assert.Equal(val, int64(count.Value), "Wrong value for count %s", key) + } +} diff --git a/cmd/trace-agent/log.go b/cmd/trace-agent/log.go new file mode 100644 index 0000000000000..4f982b0429d73 --- /dev/null +++ b/cmd/trace-agent/log.go @@ -0,0 +1,225 @@ +package main + +import ( + "fmt" + "strconv" + "time" + + log "github.com/cihub/seelog" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +const agentLoggerConfigFmt = ` + + + + + + + + + + + + + + +` + +const rawLoggerConfigFmt = ` + + + + + + + + + +` + +const rawLoggerNoFmtConfigFmt = ` + + + + + + + + + +` + +// forwardLogMsg forwards the given message to the given logger making +// sure the log level is kept. +func forwardLogMsg(logger log.LoggerInterface, msg string, lvl log.LogLevel) { + switch lvl { + case log.TraceLvl: + logger.Trace(msg) + case log.DebugLvl: + logger.Debug(msg) + case log.InfoLvl: + logger.Info(msg) + case log.WarnLvl: + logger.Warn(msg) + case log.ErrorLvl: + logger.Error(msg) + case log.CriticalLvl: + logger.Critical(msg) + } +} + +// ThrottledReceiver is a custom seelog receiver dropping log messages +// once the maximum number of log messages per interval have been +// reached. +// NOTE: we don't need to protect our log counter with a +// mutex. Seelog's default logger type is the asynchronous loop +// logger, implemented as a goroutine processing logs independently +// from where they were emitted +// (https://github.com/cihub/seelog/wiki/Logger-types). 
+type ThrottledReceiver struct { + maxLogsPerInterval int64 + + rawLogger log.LoggerInterface + rawLoggerNoFmt log.LoggerInterface + + logCount int64 + tick <-chan time.Time + done chan struct{} +} + +// ReceiveMessage implements log.CustomReceiver +func (r *ThrottledReceiver) ReceiveMessage(msg string, lvl log.LogLevel, _ log.LogContextInterface) error { + r.logCount++ + + if r.maxLogsPerInterval < 0 || r.logCount < r.maxLogsPerInterval { + forwardLogMsg(r.rawLoggerNoFmt, msg, lvl) + } else if r.logCount == r.maxLogsPerInterval { + r.rawLogger.Error("Too many messages to log, skipping for a bit...") + } + return nil +} + +// AfterParse implements log.CustomReceiver +func (r *ThrottledReceiver) AfterParse(args log.CustomReceiverInitArgs) error { + // Parse the interval attribute (no verification needed, it's an + // integer for sure) + interval, _ := strconv.Atoi(args.XmlCustomAttrs["interval"]) + + // Parse the max-per-interval attribute (no verification needed, it's an + // integer for sure) + maxLogsPerInterval, _ := strconv.Atoi( + args.XmlCustomAttrs["max-per-interval"], + ) + + // Parse the logFilePath attribute + logFilePath := args.XmlCustomAttrs["file-path"] + + // Setup rawLogger + rawLoggerConfig := fmt.Sprintf(rawLoggerConfigFmt, logFilePath) + rawLogger, err := log.LoggerFromConfigAsString(rawLoggerConfig) + if err != nil { + return err + } + + // Setup rawLoggerNoFmt + rawLoggerNoFmtConfig := fmt.Sprintf(rawLoggerNoFmtConfigFmt, logFilePath) + rawLoggerNoFmt, err := log.LoggerFromConfigAsString(rawLoggerNoFmtConfig) + if err != nil { + return err + } + + // Setup the ThrottledReceiver + r.maxLogsPerInterval = int64(maxLogsPerInterval) + r.rawLogger = rawLogger + r.rawLoggerNoFmt = rawLoggerNoFmt + r.done = make(chan struct{}) + + // If no interval was given, no need to continue setup + if interval <= 0 { + r.maxLogsPerInterval = -1 + return nil + } + + r.logCount = 0 + r.tick = time.Tick(time.Duration(interval)) + + // Start the goroutine resetting the log count + go func() { + defer watchdog.LogOnPanic() + for { + select { + case <-r.tick: + r.logCount = 0 + case <-r.done: + return + } + } + + }() + + return nil +} + +// Flush implements log.CustomReceiver +func (r *ThrottledReceiver) Flush() { + // Flush all raw loggers; a typical use case is showing an error at startup + // (eg: "cannot listen on localhost:8126: listen tcp 127.0.0.1:8126: bind: address already in use") + // and those are not shown if we don't Flush for real. + if r.rawLogger != nil { // set by AfterParse, so double-checking it's not nil + r.rawLogger.Flush() + } + if r.rawLoggerNoFmt != nil { // set by AfterParse, so double-checking it's not nil + r.rawLoggerNoFmt.Flush() + } +} + +// Close implements log.CustomReceiver +func (r *ThrottledReceiver) Close() error { + // Stop the goroutine periodically resetting the log count + close(r.done) + return nil +} + +// SetupLogger sets up the agent's logger. We use seelog for logging +// in the following way: +// * Logs with a level under "minLogLvl" are dropped. +// * Logs with a level of "trace", "debug" and "info" are always +// shown if "minLogLvl" is set accordingly. This is for development +// purposes. +// * Logs with a level of "warn" or "error" are dropped after +// "logsDropMaxPerInterval" number of messages are shown. The +// counter is reset every "logsDropInterval". If "logsDropInterval" +// is 0, dropping is disabled (and might flood your logs!).
+func SetupLogger(minLogLvl log.LogLevel, logFilePath string, logsDropInterval time.Duration, logsDropMaxPerInterval int) error { + log.RegisterReceiver("throttled", &ThrottledReceiver{}) + + // Build our config string + logConfig := fmt.Sprintf( + agentLoggerConfigFmt, + minLogLvl, + logsDropInterval, + logsDropMaxPerInterval, + logFilePath, + ) + + logger, err := log.LoggerFromConfigAsString(logConfig) + if err != nil { + return err + } + return log.ReplaceLogger(logger) +} + +// SetupDefaultLogger sets up a default logger for the agent, showing +// all log messages and with no throttling. +func SetupDefaultLogger() error { + logConfig := fmt.Sprintf(rawLoggerConfigFmt, config.DefaultLogFilePath) + + logger, err := log.LoggerFromConfigAsString(logConfig) + if err != nil { + return err + } + return log.ReplaceLogger(logger) +} diff --git a/cmd/trace-agent/main.go b/cmd/trace-agent/main.go new file mode 100644 index 0000000000000..4a6b2690b79a4 --- /dev/null +++ b/cmd/trace-agent/main.go @@ -0,0 +1,179 @@ +package main + +import ( + "context" + "fmt" + "math/rand" + "os" + "os/signal" + "runtime" + "runtime/pprof" + "strings" + "syscall" + "time" + + _ "net/http/pprof" + + log "github.com/cihub/seelog" + + "github.com/DataDog/datadog-agent/pkg/pidfile" + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/flags" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/osutil" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +// handleSignal closes a channel to exit cleanly from routines +func handleSignal(onSignal func()) { + sigChan := make(chan os.Signal, 10) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGPIPE) + for signo := range sigChan { + switch signo { + case syscall.SIGINT, syscall.SIGTERM: + log.Infof("received signal %d (%v)", signo, signo) + onSignal() + return + case syscall.SIGPIPE: + // By default systemd redirects the stdout to journald. When journald is stopped or crashes we receive a SIGPIPE signal. + // Go ignores SIGPIPE signals unless it is when stdout or stdout is closed, in this case the agent is stopped. + // We never want the agent to stop upon receiving SIGPIPE, so we intercept the SIGPIPE signals and just discard them. + default: + log.Warnf("unhandled signal %d (%v)", signo, signo) + } + } +} + +const agentDisabledMessage = `trace-agent not enabled. +Set env var DD_APM_ENABLED=true or add +apm_enabled: true +to your datadog.conf file. 
+Exiting.` + +// runAgent is the entrypoint of our code +func runAgent(ctx context.Context) { + // configure a default logger before anything so we can observe initialization + if flags.Info || flags.Version { + log.UseLogger(log.Disabled) + } else { + SetupDefaultLogger() + defer log.Flush() + } + + defer watchdog.LogOnPanic() + + // start CPU profiling + if flags.CPUProfile != "" { + f, err := os.Create(flags.CPUProfile) + if err != nil { + log.Critical(err) + } + pprof.StartCPUProfile(f) + log.Info("CPU profiling started...") + defer pprof.StopCPUProfile() + } + + if flags.Version { + fmt.Print(info.VersionString()) + return + } + + if !flags.Info && flags.PIDFilePath != "" { + err := pidfile.WritePID(flags.PIDFilePath) + if err != nil { + log.Errorf("Error while writing PID file, exiting: %v", err) + os.Exit(1) + } + + log.Infof("pid '%d' written to pid file '%s'", os.Getpid(), flags.PIDFilePath) + defer func() { + // remove pidfile if set + os.Remove(flags.PIDFilePath) + }() + } + + cfg, err := config.Load(flags.ConfigPath) + if err != nil { + osutil.Exitf("%v", err) + } + err = info.InitInfo(cfg) // for expvar & -info option + if err != nil { + panic(err) + } + + if flags.Info { + if err := info.Info(os.Stdout, cfg); err != nil { + os.Stdout.WriteString(fmt.Sprintf("failed to print info: %s\n", err)) + os.Exit(1) + } + return + } + + // Exit if tracing is not enabled + if !cfg.Enabled { + log.Info(agentDisabledMessage) + + // a sleep is necessary to ensure that supervisor registers this process as "STARTED" + // If the exit is "too quick", we enter a BACKOFF->FATAL loop even though this is an expected exit + // http://supervisord.org/subprocess.html#process-states + time.Sleep(5 * time.Second) + return + } + + // Initialize logging (replacing the default logger). No need + // to defer log.Flush, it was already done when calling + // "SetupDefaultLogger" earlier. + cfgLogLevel := strings.ToLower(cfg.LogLevel) + if cfgLogLevel == "warning" { + // to match core agent: + // https://github.com/DataDog/datadog-agent/blob/6f2d901aeb19f0c0a4e09f149c7cc5a084d2f708/pkg/config/log.go#L74-L76 + cfgLogLevel = "warn" + } + logLevel, ok := log.LogLevelFromString(cfgLogLevel) + if !ok { + logLevel = log.InfoLvl + } + duration := 10 * time.Second + if !cfg.LogThrottlingEnabled { + duration = 0 + } + err = SetupLogger(logLevel, cfg.LogFilePath, duration, 10) + if err != nil { + osutil.Exitf("cannot create logger: %v", err) + } + + // Initialize dogstatsd client + err = metrics.Configure(cfg, []string{"version:" + info.Version}) + if err != nil { + osutil.Exitf("cannot configure dogstatsd: %v", err) + } + + // count the number of times the agent started + metrics.Count("datadog.trace_agent.started", 1, nil, 1) + + // Seed rand + rand.Seed(time.Now().UTC().UnixNano()) + + agent := NewAgent(ctx, cfg) + + log.Infof("trace-agent running on host %s", cfg.Hostname) + agent.Run() + + // collect memory profile + if flags.MemProfile != "" { + f, err := os.Create(flags.MemProfile) + if err != nil { + log.Critical("could not create memory profile: ", err) + } + + // get up-to-date statistics + runtime.GC() + // Not using WriteHeapProfile but instead calling WriteTo to + // make sure we pass debug=1 and resolve pointers to names. 
+ if err := pprof.Lookup("heap").WriteTo(f, 1); err != nil { + log.Critical("could not write memory profile: ", err) + } + f.Close() + } +} diff --git a/cmd/trace-agent/main_nix.go b/cmd/trace-agent/main_nix.go new file mode 100644 index 0000000000000..faad54f11dbe8 --- /dev/null +++ b/cmd/trace-agent/main_nix.go @@ -0,0 +1,23 @@ +// +build !windows + +package main + +import ( + "context" + _ "net/http/pprof" + + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +// main is the main application entry point +func main() { + ctx, cancelFunc := context.WithCancel(context.Background()) + + // Handle stops properly + go func() { + defer watchdog.LogOnPanic() + handleSignal(cancelFunc) + }() + + runAgent(ctx) +} diff --git a/cmd/trace-agent/main_windows.go b/cmd/trace-agent/main_windows.go new file mode 100644 index 0000000000000..4c3f3a1eb357e --- /dev/null +++ b/cmd/trace-agent/main_windows.go @@ -0,0 +1,294 @@ +// +build windows + +package main + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + _ "net/http/pprof" + + "github.com/DataDog/datadog-agent/pkg/trace/flags" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" + + log "github.com/cihub/seelog" + "golang.org/x/sys/windows/svc" + "golang.org/x/sys/windows/svc/debug" + "golang.org/x/sys/windows/svc/eventlog" + "golang.org/x/sys/windows/svc/mgr" +) + +var elog debug.Log + +const ServiceName = "datadog-trace-agent" + +type myservice struct{} + +func (m *myservice) Execute(args []string, r <-chan svc.ChangeRequest, changes chan<- svc.Status) (ssec bool, errno uint32) { + const cmdsAccepted = svc.AcceptStop | svc.AcceptShutdown + changes <- svc.Status{State: svc.StartPending} + changes <- svc.Status{State: svc.Running, Accepts: cmdsAccepted} + + ctx, cancelFunc := context.WithCancel(context.Background()) + + go func() { + for { + select { + case c := <-r: + switch c.Cmd { + case svc.Interrogate: + changes <- c.CurrentStatus + // Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4 + time.Sleep(100 * time.Millisecond) + changes <- c.CurrentStatus + case svc.Stop, svc.Shutdown: + elog.Info(0x40000006, ServiceName) + changes <- svc.Status{State: svc.StopPending} + cancelFunc() + return + default: + elog.Warning(0xc000000A, string(c.Cmd)) + } + } + } + }() + elog.Info(0x40000003, ServiceName) + runAgent(ctx) + + changes <- svc.Status{State: svc.Stopped} + return +} + +func runService(isDebug bool) { + var err error + if isDebug { + elog = debug.New(ServiceName) + } else { + elog, err = eventlog.Open(ServiceName) + if err != nil { + return + } + } + defer elog.Close() + + run := svc.Run + if isDebug { + run = debug.Run + } + elog.Info(0x40000007, ServiceName) + err = run(ServiceName, &myservice{}) + if err != nil { + elog.Error(0xc0000008, err.Error()) + return + } + elog.Info(0x40000004, ServiceName) +} + +// main is the main application entry point +func main() { + isIntSess, err := svc.IsAnInteractiveSession() + if err != nil { + fmt.Printf("failed to determine if we are running in an interactive session: %v", err) + } + if !isIntSess { + runService(false) + return + } + defer log.Flush() + // sigh. Go doesn't have boolean xor operator. 
The options are mutually exclusive, + // make sure more than one wasn't specified + optcount := 0 + if flags.Win.InstallService { + optcount++ + } + if flags.Win.UninstallService { + optcount++ + } + if flags.Win.StartService { + optcount++ + } + if flags.Win.StopService { + optcount++ + } + if optcount > 1 { + fmt.Printf("Incompatible options chosen") + return + } + if flags.Win.InstallService { + if err = installService(); err != nil { + fmt.Printf("Error installing service %v\n", err) + } + return + } + if flags.Win.UninstallService { + if err = removeService(); err != nil { + fmt.Printf("Error removing service %v\n", err) + } + return + } + if flags.Win.StartService { + if err = startService(); err != nil { + fmt.Printf("Error starting service %v\n", err) + } + return + } + if flags.Win.StopService { + if err = stopService(); err != nil { + fmt.Printf("Error stopping service %v\n", err) + } + return + + } + + // if we are an interactive session, then just invoke the agent on the command line. + + ctx, cancelFunc := context.WithCancel(context.Background()) + // Handle stops properly + go func() { + defer watchdog.LogOnPanic() + handleSignal(cancelFunc) + }() + + // Invoke the Agent + runAgent(ctx) +} + +func startService() error { + m, err := mgr.Connect() + if err != nil { + return err + } + defer m.Disconnect() + s, err := m.OpenService(ServiceName) + if err != nil { + return fmt.Errorf("could not access service: %v", err) + } + defer s.Close() + err = s.Start("is", "manual-started") + if err != nil { + return fmt.Errorf("could not start service: %v", err) + } + return nil +} + +func stopService() error { + return controlService(svc.Stop, svc.Stopped) +} + +func restartService() error { + var err error + if err = stopService(); err == nil { + err = startService() + } + return err +} + +func controlService(c svc.Cmd, to svc.State) error { + m, err := mgr.Connect() + if err != nil { + return err + } + defer m.Disconnect() + s, err := m.OpenService(ServiceName) + if err != nil { + return fmt.Errorf("could not access service: %v", err) + } + defer s.Close() + status, err := s.Control(c) + if err != nil { + return fmt.Errorf("could not send control=%d: %v", c, err) + } + timeout := time.Now().Add(10 * time.Second) + for status.State != to { + if timeout.Before(time.Now()) { + return fmt.Errorf("timeout waiting for service to go to state=%d", to) + } + time.Sleep(300 * time.Millisecond) + status, err = s.Query() + if err != nil { + return fmt.Errorf("could not retrieve service status: %v", err) + } + } + return nil +} + +func installService() error { + exepath, err := exePath() + if err != nil { + return err + } + fmt.Printf("exepath: %s\n", exepath) + + m, err := mgr.Connect() + if err != nil { + return err + } + defer m.Disconnect() + s, err := m.OpenService(ServiceName) + if err == nil { + s.Close() + return fmt.Errorf("service %s already exists", ServiceName) + } + s, err = m.CreateService(ServiceName, exepath, mgr.Config{DisplayName: "Datadog Agent Service"}) + if err != nil { + return err + } + defer s.Close() + err = eventlog.InstallAsEventCreate(ServiceName, eventlog.Error|eventlog.Warning|eventlog.Info) + if err != nil { + s.Delete() + return fmt.Errorf("SetupEventLogSource() failed: %s", err) + } + return nil +} + +func exePath() (string, error) { + prog := os.Args[0] + p, err := filepath.Abs(prog) + if err != nil { + return "", err + } + fi, err := os.Stat(p) + if err == nil { + if !fi.Mode().IsDir() { + return p, nil + } + err = fmt.Errorf("%s is directory", p) + } + if 
filepath.Ext(p) == "" { + p += ".exe" + fi, err := os.Stat(p) + if err == nil { + if !fi.Mode().IsDir() { + return p, nil + } + err = fmt.Errorf("%s is directory", p) + } + } + return "", err +} + +func removeService() error { + m, err := mgr.Connect() + if err != nil { + return err + } + defer m.Disconnect() + s, err := m.OpenService(ServiceName) + if err != nil { + return fmt.Errorf("service %s is not installed", ServiceName) + } + defer s.Close() + err = s.Delete() + if err != nil { + return err + } + err = eventlog.Remove(ServiceName) + if err != nil { + return fmt.Errorf("RemoveEventLogSource() failed: %s", err) + } + return nil +} diff --git a/cmd/trace-agent/model_test.go b/cmd/trace-agent/model_test.go new file mode 100644 index 0000000000000..19e75de99cf4a --- /dev/null +++ b/cmd/trace-agent/model_test.go @@ -0,0 +1,33 @@ +// Some benchmarks defined here because it both requires fixtures & model +// and putting them in model would cause a circular dependency. + +package main + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" +) + +const ( + defaultEnv = "dev" +) + +func BenchmarkHandleSpanRandom(b *testing.B) { + sb := agent.NewStatsRawBucket(0, 1e9) + aggr := []string{} + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + trace := testutil.RandomTrace(10, 8) + root := traceutil.GetRoot(trace) + traceutil.ComputeTopLevel(trace) + wt := agent.NewWeightedTrace(trace, root) + for _, span := range wt { + sb.HandleSpan(span, defaultEnv, aggr, nil) + } + } +} diff --git a/cmd/trace-agent/sampler.go b/cmd/trace-agent/sampler.go new file mode 100644 index 0000000000000..7c1fada96906d --- /dev/null +++ b/cmd/trace-agent/sampler.go @@ -0,0 +1,120 @@ +package main + +import ( + "fmt" + "reflect" + "sync/atomic" + "time" + + log "github.com/cihub/seelog" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +// Sampler chooses wich spans to write to the API +type Sampler struct { + // For stats + keptTraceCount uint64 + totalTraceCount uint64 + + lastFlush time.Time + + // actual implementation of the sampling logic + engine sampler.Engine +} + +// NewScoreSampler creates a new empty sampler ready to be started +func NewScoreSampler(conf *config.AgentConfig) *Sampler { + return &Sampler{ + engine: sampler.NewScoreEngine(conf.ExtraSampleRate, conf.MaxTPS), + } +} + +// NewErrorsSampler creates a new sampler dedicated to traces containing errors +// to isolate them from the global max tps. It behaves exactly like the normal +// ScoreSampler except that its statistics are reported under a different name. 
+func NewErrorsSampler(conf *config.AgentConfig) *Sampler { + return &Sampler{ + engine: sampler.NewErrorsEngine(conf.ExtraSampleRate, conf.MaxTPS), + } +} + +// NewPrioritySampler creates a new empty distributed sampler ready to be started +func NewPrioritySampler(conf *config.AgentConfig, dynConf *sampler.DynamicConfig) *Sampler { + return &Sampler{ + engine: sampler.NewPriorityEngine(conf.ExtraSampleRate, conf.MaxTPS, &dynConf.RateByService), + } +} + +// Run starts sampling traces +func (s *Sampler) Run() { + go func() { + defer watchdog.LogOnPanic() + s.engine.Run() + }() + + go func() { + defer watchdog.LogOnPanic() + s.logStats() + }() +} + +// Add samples a trace and returns true if trace was sampled (should be kept), false otherwise +func (s *Sampler) Add(t agent.ProcessedTrace) (sampled bool, rate float64) { + atomic.AddUint64(&s.totalTraceCount, 1) + sampled, rate = s.engine.Sample(t.Trace, t.Root, t.Env) + if sampled { + atomic.AddUint64(&s.keptTraceCount, 1) + } + return sampled, rate +} + +// Stop stops the sampler +func (s *Sampler) Stop() { + s.engine.Stop() +} + +// logStats reports statistics and update the info exposed. +func (s *Sampler) logStats() { + for now := range time.Tick(10 * time.Second) { + keptTraceCount := atomic.SwapUint64(&s.keptTraceCount, 0) + totalTraceCount := atomic.SwapUint64(&s.totalTraceCount, 0) + + duration := now.Sub(s.lastFlush) + s.lastFlush = now + + // TODO: do we still want that? figure out how it conflicts with what the `state` exposes / what is public metrics. + var stats info.SamplerStats + if duration > 0 { + stats.KeptTPS = float64(keptTraceCount) / duration.Seconds() + stats.TotalTPS = float64(totalTraceCount) / duration.Seconds() + } + engineType := fmt.Sprint(reflect.TypeOf(s.engine)) + log.Debugf("%s: flushed %d sampled traces out of %d", engineType, keptTraceCount, totalTraceCount) + + state := s.engine.GetState() + + switch state := state.(type) { + case sampler.InternalState: + log.Debugf("%s: inTPS: %f, outTPS: %f, maxTPS: %f, offset: %f, slope: %f, cardinality: %d", + engineType, state.InTPS, state.OutTPS, state.MaxTPS, state.Offset, state.Slope, state.Cardinality) + + // publish through expvar + // TODO: avoid type switch, prefer engine method + switch s.engine.GetType() { + case sampler.NormalScoreEngineType: + info.UpdateSamplerInfo(info.SamplerInfo{Stats: stats, State: state}) + case sampler.ErrorsScoreEngineType: + info.UpdateErrorsSamplerInfo(info.SamplerInfo{Stats: stats, State: state}) + case sampler.PriorityEngineType: + info.UpdatePrioritySamplerInfo(info.SamplerInfo{Stats: stats, State: state}) + } + default: + log.Debugf("unhandled sampler engine, can't log state") + } + } +} diff --git a/cmd/trace-agent/service_mapper.go b/cmd/trace-agent/service_mapper.go new file mode 100644 index 0000000000000..3024fe949c2be --- /dev/null +++ b/cmd/trace-agent/service_mapper.go @@ -0,0 +1,114 @@ +package main + +import ( + "sync" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" + log "github.com/cihub/seelog" +) + +// serviceApp represents the app to which certain integration belongs to +const serviceApp = "app" + +// ServiceMapper provides a cache layer over model.ServicesMetadata pipeline +// Used in conjunction with ServiceWriter: in-> ServiceMapper out-> ServiceWriter +type ServiceMapper struct { + in <-chan pb.ServicesMetadata + out chan<- pb.ServicesMetadata + exit chan bool + done sync.WaitGroup + cache pb.ServicesMetadata +} + +// NewServiceMapper 
returns an instance of ServiceMapper with the provided channels +func NewServiceMapper(in <-chan pb.ServicesMetadata, out chan<- pb.ServicesMetadata) *ServiceMapper { + return &ServiceMapper{ + in: in, + out: out, + exit: make(chan bool), + cache: make(pb.ServicesMetadata), + } +} + +// Start runs the event loop in a non-blocking way +func (s *ServiceMapper) Start() { + s.done.Add(1) + + go func() { + defer watchdog.LogOnPanic() + s.Run() + s.done.Done() + }() +} + +// Stop gracefully terminates the event-loop +func (s *ServiceMapper) Stop() { + close(s.exit) + s.done.Wait() +} + +// Run triggers the event-loop that consumes pb.ServicesMetadata +func (s *ServiceMapper) Run() { + telemetryTicker := time.NewTicker(1 * time.Minute) + defer telemetryTicker.Stop() + + for { + select { + case metadata := <-s.in: + s.update(metadata) + case <-telemetryTicker.C: + log.Infof("total number of tracked services: %d", len(s.cache)) + case <-s.exit: + return + } + } +} + +func (s *ServiceMapper) update(metadata pb.ServicesMetadata) { + var changes pb.ServicesMetadata + + for k, v := range metadata { + if !s.shouldAdd(k, metadata) { + continue + } + + // We do this inside the for loop to avoid unnecessary memory allocations. + // After a few method executions, the cache will be warmed up and this section will be skipped altogether. + if changes == nil { + changes = make(pb.ServicesMetadata) + } + + changes[k] = v + } + + if changes == nil { + return + } + + s.out <- changes + + for k, v := range changes { + s.cache[k] = v + } +} + +func (s *ServiceMapper) shouldAdd(service string, metadata pb.ServicesMetadata) bool { + cacheEntry, ok := s.cache[service] + + // No cache entry? + if !ok { + return true + } + + // Cache entry came from service API? + if _, ok = cacheEntry[serviceApp]; ok { + return false + } + + // New metadata value came from service API?
+ _, ok = metadata[service][serviceApp] + + return ok +} diff --git a/cmd/trace-agent/service_mapper_test.go b/cmd/trace-agent/service_mapper_test.go new file mode 100644 index 0000000000000..9485cc4608b19 --- /dev/null +++ b/cmd/trace-agent/service_mapper_test.go @@ -0,0 +1,72 @@ +package main + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func TestServiceMapper(t *testing.T) { + assert := assert.New(t) + + mapper, in, out := testMapper() + mapper.Start() + defer mapper.Stop() + + input := pb.ServicesMetadata{"service-a": {"app_type": "type-a"}} + in <- input + output := <-out + + // When the service is ingested for the first time, we simply propagate it + // to the output channel and add an entry to the cache map + assert.Equal(input, output) + + // This entry will result in a cache-hit and therefore will be filtered out + in <- pb.ServicesMetadata{"service-a": {"app_type": "SOMETHING_DIFFERENT"}} + + // This represents a new service and thus will be cached and propagated to the outbound channel + newService := pb.ServicesMetadata{"service-b": {"app_type": "type-b"}} + in <- newService + output = <-out + + assert.Equal(newService, output) +} + +func TestCachePolicy(t *testing.T) { + assert := assert.New(t) + + mapper, in, out := testMapper() + mapper.Start() + defer mapper.Stop() + + input := pb.ServicesMetadata{"service-a": {"app_type": "type-a"}} + in <- input + output := <-out + + // A new service entry should propagate the metadata the the outbound channel + assert.Equal(input, output) + + // A service entry that is already in cache should only be propagated IF: + // - Current version does NOT have "app" + // - New version DOES have "app" + + // This first attempt won't be propagated to the writer + firstAttempt := pb.ServicesMetadata{"service-a": {"app_type": "FIRST_ATTEMPT"}} + in <- firstAttempt + + // But this second will + secondAttempt := pb.ServicesMetadata{"service-a": {"app_type": "SECOND_ATTEMPT", "app": "app-a"}} + in <- secondAttempt + + output = <-out + assert.Equal(secondAttempt, output) +} + +func testMapper() (mapper *ServiceMapper, in, out chan pb.ServicesMetadata) { + in = make(chan pb.ServicesMetadata, 1) + out = make(chan pb.ServicesMetadata, 1) + mapper = NewServiceMapper(in, out) + + return mapper, in, out +} diff --git a/cmd/trace-agent/sublayers.go b/cmd/trace-agent/sublayers.go new file mode 100644 index 0000000000000..ff6cc9939f593 --- /dev/null +++ b/cmd/trace-agent/sublayers.go @@ -0,0 +1,91 @@ +package main + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + log "github.com/cihub/seelog" +) + +// Subtrace represents the combination of a root span and the trace consisting of all its descendant spans +type Subtrace struct { + Root *pb.Span + Trace pb.Trace +} + +// spanAndAncestors is used by ExtractTopLevelSubtraces to store the pair of a span and its ancestors +type spanAndAncestors struct { + Span *pb.Span + Ancestors []*pb.Span +} + +// element and queue implement a very basic LIFO used to do an iterative DFS on a trace +type element struct { + SpanAndAncestors *spanAndAncestors + Next *element +} + +type stack struct { + head *element +} + +func (s *stack) Push(value *spanAndAncestors) { + e := &element{value, nil} + if s.head == nil { + s.head = e + return + } + e.Next = s.head + s.head = e +} + +func (s *stack) Pop() *spanAndAncestors { + if s.head == nil { + return nil + } + value := s.head.SpanAndAncestors + 
s.head = s.head.Next + return value +} + +// ExtractTopLevelSubtraces extracts all subtraces rooted in a toplevel span; +// ComputeTopLevel should be called beforehand. +func ExtractTopLevelSubtraces(t pb.Trace, root *pb.Span) []Subtrace { + if root == nil { + return []Subtrace{} + } + childrenMap := traceutil.ChildrenMap(t) + subtraces := []Subtrace{} + + visited := make(map[*pb.Span]bool, len(t)) + subtracesMap := make(map[*pb.Span][]*pb.Span) + var next stack + next.Push(&spanAndAncestors{root, []*pb.Span{}}) + + // We do a DFS on the trace to record the toplevel ancestors of each span + for current := next.Pop(); current != nil; current = next.Pop() { + // We do not extract subtraces for toplevel spans that have no children + // since these are not interesting + if traceutil.HasTopLevel(current.Span) && len(childrenMap[current.Span.SpanID]) > 0 { + current.Ancestors = append(current.Ancestors, current.Span) + } + visited[current.Span] = true + for _, ancestor := range current.Ancestors { + subtracesMap[ancestor] = append(subtracesMap[ancestor], current.Span) + } + for _, child := range childrenMap[current.Span.SpanID] { + // Continue if this span has already been explored (meaning the + // trace is not a tree) + if visited[child] { + log.Warnf("Found a cycle while processing traceID:%v, trace should be a tree", t[0].TraceID) + continue + } + next.Push(&spanAndAncestors{child, current.Ancestors}) + } + } + + for topLevel, subtrace := range subtracesMap { + subtraces = append(subtraces, Subtrace{topLevel, subtrace}) + } + + return subtraces +} diff --git a/cmd/trace-agent/sublayers_test.go b/cmd/trace-agent/sublayers_test.go new file mode 100644 index 0000000000000..423daedff07f8 --- /dev/null +++ b/cmd/trace-agent/sublayers_test.go @@ -0,0 +1,99 @@ +package main + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + "github.com/stretchr/testify/assert" +) + +func TestExtractTopLevelSubtracesWithSimpleTrace(t *testing.T) { + assert := assert.New(t) + + trace := pb.Trace{ + &pb.Span{SpanID: 1, ParentID: 0, Service: "s1"}, + &pb.Span{SpanID: 2, ParentID: 1, Service: "s2"}, + &pb.Span{SpanID: 3, ParentID: 2, Service: "s2"}, + &pb.Span{SpanID: 4, ParentID: 3, Service: "s2"}, + &pb.Span{SpanID: 5, ParentID: 1, Service: "s1"}, + } + + expected := []Subtrace{ + Subtrace{trace[0], trace}, + Subtrace{trace[1], []*pb.Span{trace[1], trace[2], trace[3]}}, + } + + traceutil.ComputeTopLevel(trace) + subtraces := ExtractTopLevelSubtraces(trace, trace[0]) + + assert.Equal(len(expected), len(subtraces)) + + subtracesMap := make(map[*pb.Span]Subtrace) + for _, s := range subtraces { + subtracesMap[s.Root] = s + } + + for _, s := range expected { + assert.ElementsMatch(s.Trace, subtracesMap[s.Root].Trace) + } +} + +func TestExtractTopLevelSubtracesShouldIgnoreLeafTopLevel(t *testing.T) { + assert := assert.New(t) + + trace := pb.Trace{ + &pb.Span{SpanID: 1, ParentID: 0, Service: "s1"}, + &pb.Span{SpanID: 2, ParentID: 1, Service: "s2"}, + &pb.Span{SpanID: 3, ParentID: 2, Service: "s2"}, + &pb.Span{SpanID: 4, ParentID: 1, Service: "s3"}, + } + + expected := []Subtrace{ + Subtrace{trace[0], trace}, + Subtrace{trace[1], []*pb.Span{trace[1], trace[2]}}, + } + + traceutil.ComputeTopLevel(trace) + subtraces := ExtractTopLevelSubtraces(trace, trace[0]) + + assert.Equal(len(expected), len(subtraces)) + + subtracesMap := make(map[*pb.Span]Subtrace) + for _, s := range subtraces { + subtracesMap[s.Root] = s + } + + for _, s := range expected
{ + assert.ElementsMatch(s.Trace, subtracesMap[s.Root].Trace) + } +} + +func TestExtractTopLevelSubtracesWorksInSpiteOfCycles(t *testing.T) { + assert := assert.New(t) + + trace := pb.Trace{ + &pb.Span{SpanID: 1, ParentID: 3, Service: "s1"}, + &pb.Span{SpanID: 2, ParentID: 1, Service: "s2"}, + &pb.Span{SpanID: 3, ParentID: 2, Service: "s2"}, + } + + expected := []Subtrace{ + Subtrace{trace[0], trace}, + Subtrace{trace[1], []*pb.Span{trace[1], trace[2]}}, + } + + traceutil.ComputeTopLevel(trace) + subtraces := ExtractTopLevelSubtraces(trace, trace[0]) + + assert.Equal(len(expected), len(subtraces)) + + subtracesMap := make(map[*pb.Span]Subtrace) + for _, s := range subtraces { + subtracesMap[s.Root] = s + } + + for _, s := range expected { + assert.ElementsMatch(s.Trace, subtracesMap[s.Root].Trace) + } +} diff --git a/cmd/trace-agent/trace_service_extractor.go b/cmd/trace-agent/trace_service_extractor.go new file mode 100644 index 0000000000000..6acc7f3e059de --- /dev/null +++ b/cmd/trace-agent/trace_service_extractor.go @@ -0,0 +1,42 @@ +package main + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// appType is one of the pieces of information embedded in ServiceMetadata +const appType = "app_type" + +// TraceServiceExtractor extracts service metadata from top-level spans +type TraceServiceExtractor struct { + outServices chan<- pb.ServicesMetadata +} + +// NewTraceServiceExtractor returns a new TraceServiceExtractor +func NewTraceServiceExtractor(out chan<- pb.ServicesMetadata) *TraceServiceExtractor { + return &TraceServiceExtractor{out} +} + +// Process extracts service metadata from top-level spans and sends it downstream +func (ts *TraceServiceExtractor) Process(t agent.WeightedTrace) { + meta := make(pb.ServicesMetadata) + + for _, s := range t { + if !s.TopLevel { + continue + } + + if _, ok := meta[s.Service]; ok { + continue + } + + if v := s.Type; len(v) > 0 { + meta[s.Service] = map[string]string{appType: v} + } + } + + if len(meta) > 0 { + ts.outServices <- meta + } +} diff --git a/cmd/trace-agent/trace_service_extractor_test.go b/cmd/trace-agent/trace_service_extractor_test.go new file mode 100644 index 0000000000000..474e9d880a590 --- /dev/null +++ b/cmd/trace-agent/trace_service_extractor_test.go @@ -0,0 +1,40 @@ +package main + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + "github.com/stretchr/testify/assert" +) + +func TestTracerServiceExtractor(t *testing.T) { + assert := assert.New(t) + + testChan := make(chan pb.ServicesMetadata) + testExtractor := NewTraceServiceExtractor(testChan) + + trace := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Service: "service-a", Type: "type-a"}, + &pb.Span{TraceID: 1, SpanID: 2, ParentID: 1, Service: "service-b", Type: "type-b"}, + &pb.Span{TraceID: 1, SpanID: 3, ParentID: 1, Service: "service-c", Type: "type-c"}, + &pb.Span{TraceID: 1, SpanID: 4, ParentID: 3, Service: "service-c", Type: "ignore"}, + } + + traceutil.ComputeTopLevel(trace) + wt := agent.NewWeightedTrace(trace, trace[0]) + + go func() { + testExtractor.Process(wt) + }() + + metadata := <-testChan + + // Result should only contain information derived from top-level spans + assert.Equal(metadata, pb.ServicesMetadata{ + "service-a": {"app_type": "type-a"}, + "service-b": {"app_type": "type-b"}, + "service-c": {"app_type": "type-c"}, + }) +} diff --git 
a/cmd/trace-agent/windows_resources/project.ico b/cmd/trace-agent/windows_resources/project.ico new file mode 100644 index 0000000000000000000000000000000000000000..21ef10ea0ad1b96dc5408798996cec17dadf516a GIT binary patch literal 204862

zNaf!ymhe(VWODjV(do*uFfR+g8)LF=jIPA8GK@fC{|A*iRj?S3io3IFrkRb*#a<_f z$p$SgzkL5gPG7XsthT7wUmBiR7oC#VV`O&kQ91o56c0_#?lU&)*0`en6R@5R-F)d^ zu*kfa>7^4NK<&p@Dli>l;#ZyUI2LAVBOzNgEC~y}u{B@u;K^Ax#^&@JmosQ`(H%)S z114nm8<%!tWJ#Z~Sfq@0FD_H=pQAh2|3@d^9 z;k`y_zllfp9j4X&N-h}5c|`d~0duI*CNa(KC0eCRX5&T}np7P%y=2^dG#Et}P8)3{ z4%>ac=C3|DoIiLx=IwJy_B``S8IQ|NcUeOH@acu4X5nFkgp5BtGH2!M&Sy%7B^2pb z&-OiQN{I6|)FZUGQZ)ac)<1gW1xZrk<`BZe4jaJsbnT4Si?sihy?Grh5Bf8U@B9~z zGm8ng;YoF2iG{;woH}vJjDxwW4R6!X3wS%N9J_Q<>w+Hf<`gG=V?}NfVBwWdQkWL@3X5h+Amf?6^6FLlM04Y*PhBo$On~wv;G|# z#J2&Ojh)0X04o2+x9EeQ%-~U*-D}jE*(FF2*|(AoG@DAQIT{-pPyc+fBo=ino*?Ef zup4rsR&EMP&bR{pr`usza$b)y3!g~G%MN;T5F~?q2{>}c{8i~aMq$I%a+3El?9|-w@ zWB4?0GmNH*A`57mqKSw(`9-&kDjGR+=hAgZkr&+g^&8655{d!7wdLf=(^+P0rAUBX>|T%7))UCEkJke!R(J)}JG&{#U+&W=;q``QpeY#l6S- zMZ)51kxE3;u0?CiHm3m|7D3U*9Gr1fW9L$gle3ZROl`2D(eLK+gDwo#kk&;ez1l5i z!QWs0;fJ57TTGXW%q1gjQ;D_@M4;Gm_xat0tFyzVmiC)k9W}%15aDhL?8so63U%`% zlF$mu!RpkchmU{y1*TO{2NgQ`Y%6y#7|_t%aK5GK?bk|kdX2>*ZoDs}f?;keSK&90 z14LnK!RXYoJ`?8tdGN1z$^_GIXV7JZC7}6YF9@dDTBhZ!qXdBV|Gu|2&h8#F_lB4` zo&A7GyCp9D_HiWF4~Xp%w|ty+VwF{{9oP9V-FTmO-8Y1Ls@htb_D7cC*rT& z5cA^yHwZB5d=k#iuT8K9oK2*q1i{<^6X)C*D|$L0X5Sc-aogyefm6!I-Lv|+{JqOI zq0dMaK3WN)Jy5@x!SPm*>LTj8e!AzQwR1|#N6kv>H6r7>sQg}I(S9o#FsW$3q{5p< zW?mne)ptVO)JO1=_vase0iH(2Ed8v{xW{lfELr1(*eaGfvUA_YS4+yr-jms9OvbfQ z`Q77?5PqfoCzbY{h}LFix2XKS6XvHrzA>-nvwerH1k4}S2GXHlW0+=uhOw;hS&0Y= zU&H(ubL|@Jr>97DK*)6kfiY6tz-DBT(qXkicq$>BImp@g87ViC#s!R3Hk|%i`YG!2 z9KW}i4Xf=mEd3^kNi~vFofwaY<2q@un-&q&a9y(&a=fg-h}$OG>56`TgdIg3GKe7f zk_MIuGZ?SkfBO2n?>_nb`;U)(cjVYVPn>MD8&jA>&9mtk$qIy)B*0;s2|6=~2rQ&( z#`_>#tJb%TY|MS^uxmBv4Q zh?+CkbM#E5pUFIB83~y`Ci~=UO4*z`F#$cSQ$5oQPS$LqLhAP44p3K@T;NRDch`kX>CMXc!2Gcz}2A86DLbq33<^R z5XN(a#^zgSuA3R6H`)z=lMg+CKqRMN6121-eNKyB2Uwk4HiH0NOU>vHO!#6#T3qO^ zxz{sU(W!H%nVwRiGqozAg$P|wFr-!`v=G6hLg#B$LJJYPo?u9=N@yX1NrleWs)QCI zbUne4T9wd31e4Ny4F(-7$We}1=+PW^(7|CPlDZBut+g?&ISy^CM*~0y3v!er)~!$j zuIm5a)jx8Txm56l=FBpO0XkN51S5pjpq-{Q{HN%rR)Bf@i^1kDm%LhpiUn!a15S_~ zCWJY_z+XNyA+!WpNP-}k<}=4S&IS?yV~(iOL17AkSkQcCrsK@w8pa&0!67a|K?=vA z96`n~K+b~Ku_Eh;WhD~pAhZUi-FQ*RvJwo2*GVu~&O*2`!eI=8AeiQ}05qWig!Z~# z%oHgJ650zdNDvl)SXgmSy-3d_(jaCEM}ma*!V40F1{k=<{lXZLl3*~rApgbsb-EWtOO7r|Ds=F_Q9oTy9A+%y zpZ`3$vvSFj2Vbh2^dP1ILy||3>?5 z6E5lRe;@miQ$E;VcF4M(2VkEno?143r4Kq0kXzn3iQ)#3iUP|NH4@xBPcLOEYHs{N-6aMq_0qzmYdD_su+2 z`;9;IG0mW6C?=gI*Muh)MJD3yC%#JuyJF1I{*sxC0va#w3wn>;QNN6RXLXykYC|JkN75YBOi$VY(-@ry_>L$JYNjXc zfN9Ks6aC0C=h|^LY=4x*G9S(nfF?BGO*2RHwOY;B^p(?3H9zem+On?bb$odc-y^7+ z_P0$13y$vl2vZ2;Kv(zc&%W4Nw|L=^n*%Q zBSf;G^W~m!-5j4gDl7#;a_O-Ro(aGc^?*a*^~1F{likGRyP$p_H@DrF7X-1zrR>E zV6rUe!|dx5J-g96C ztntir3p!*O;D&*t)e0^6%dY>-Z)eyxGoEiHs0|YvQ2K>tJGWFVET%8{S`+W|Wla^<-ljXiTw?n| zL&(@-og~PFtuo-R;F-%o#)B57$OxWeHC`~95WJ{z2!hWX4bW-@uhk08NYUUHyb}6# zqMGKk#0Cj^S_>HSwFaR@iE!ND|5W|9J4!~}RU4kfyAqX0Pv13fC3dmU87E*{`?;ou z&v$-s@>l9&@BznRO*#&&=cYN9=0>o*Y2N}C(x@XX=FrXr?a34j10?7MgTt{LB|#B% zi-TKJubJ;#L7wcz4iCkw#=>EQN8T&=G|D) zMh!6gfLb4B*Zc5wTr&rnODM(K#ob2oUwXP2j1@HgGHAao;%(szhuzDvo@WNQ@xl?E zEY5t11tIhwM?YKD>6x?UfO#?VJhDlhBkLXXx5XU5qH2n znGQ^dK~Ghl%n%hiV4kWxnOYElE*#L%1*yVy@Jh%54 zd~S)}h}}N^^N&9RQ;Xw^#T#ENt~OtzLx0)qeL=hQ5Y63ZbRV!?1J;9M_nxR^d|9t1 za(Yd8LUE7L_^4l2pRwf;GZ)=R0fO^5MgFb_)25 z()h;Ar?A-z4HktbW%e4GbK~%uC|W`eCYF<1i^}Y*d{kL?hcrnx{E;Jh=(^=OyEa%3T<|w zY{h_f6mBJaFsN$KBx0?e`4!rR2 zVV`Kz^`^|L>@#h6mcxu6yY3%OloeWP4NE7)G}yvqDdBKXi+e_CfT;3hvN&TrX^|3w zZE&9%ATT#^8THd;!1Ep7_gS|SB_S6(65JYBFksr7DUYvvu4rfN(oc8n|Lru*Pv!${ zZ_L=oh846(ILL8{_M?hD_R&%L`V#gvicFfHNFOF}9JFBz8a&|%<)dk*N8B>lcnWQK zE|ZP;T-`@UcGa(3`FKY4*n6{kjn2I>t}-f#rlDedm6}X;ONpl*1>P5YckmgrRnZc| 
zkG4aJLjMAfn55z%Q}Vlwm~%r^PXCGJBWEqS@1=Dwm%LlI9Pg}8{_>0V3-(KaSy=yo zCL9`4Xj#GG^yCPBa83~~S=LxEIViDiA{br^x@qAE;nvI?1_;bSAPAV_%ot&6f@v0U z^TFXm00>@Y{pOn&+vlx7V*>?)+X?t+EOx)a_EY6Ur{-KAm(gQn(U6qI_dL7ywUYN1 ztU9*u(3yrt{`v_jHQ2)@iEY7b)_fKbL5YIzuo7T)w}{H2sJfGLhfE{w>{8sS^dW%r z8P~@%=RCEe_g>59O!xM^b9i4GqblRQFiWQMY3ACy^h2!Rl5_2e^?F%FtFXgwOH z`Jw0+LjgqM0m z(?z7F6P3PBZ3rE}-cIj38s#7B1#A^)?xQKHqt z+wq)bKGSw7^n^uM!}1wr4te3T;n~?$fYAMW@kmEuR=Yq0idby@CQ4n%+OCC2%=k{QUC=%QtVxt(-UO zxuSkka<7jo8#oD%KN@8`Nl^K2fNEHouw$k8LOe(hIE$KU{;M%yf z+eYB2R5$J6rT<9VkW;g(Zq;Xd4xIk&cWt)MM?+G`7j?bO(UA|eq!%5EY$zCqZiujy zoDIXtgp>wwP>>wOnl&vjhd2vrrlwf~W8@3~9~=w1JoZk$(7{VSRe;4|@u-=9Joe2w z{&WIW{#3q1GZU6}JDWPlDau&{SqULcc`Y{OA0=mQ51IubYSCy||7tn*J;sjcW#RFM zH|(Y&ftLcl=G}|e77a^W{P3&qGQ#9e8(4bQ%HG}f%uMC0)nWV>wBeCpVHj#xZ0aF63r3j$Gi zUf^@C=z8Pz2X=4HzIpiE8)GwWA73=)o&^uRgb!ihZTN?qcVU}2vt=r|P~0MJ6b@&E zK$T}!A!^@x$uny8t0idDpRw^SWll5(>!r#v*@VQ>X_-(7?KK0OgCRvCBp;Dc0eW& zXAUrJlKwJAI1UjV%Yqu9X{|wMCRkADU_sF);*8XIL^*;E4lB_b1kX_t6hX})LE|5M z-zfctShVv!F9bin%ZDar_ZfF&+dh0`hf9p;Bt5pN4*iaxz>I{)9%D z(r&T8;%%yEC@eHzrH56p`GT*B?hW%fo3*c&)zG(7tu8!ox{lAHpw)(9UD56F9GR^NP1)D|mLTv?P?wZFXh5-nl3J?)pIAw4j1<(~X*Wja4w$y+p652?)*e{B z?Ub3k4Fej{14Uz-HgUEtI%k+}1GtM!G^GmNY5R5n@23+dKU}tDeR{5UmpAz51M6NgoX4va7scX}!WWti) zjLSsnYpUsv)dSgu13J4P6}m{L`Z;+Fx9zWAg?&i)ifHGfRA3TjVN~*&vl#5RUU-Vp zo$3$WRE>n%w_hu+jhYc?iN6WKB>|%-b$2}lK~o+{K~eu{nD(QkE|$hyV7%wT``_}I z_IlnKDYGA0CSu; zb?VT{O>5ID-n{FnyuK6Du8l4pGzGmC+!2_+$DN%{4aPgT_g`Pp)I=+)uaN)Q+Kbh% zjbU1IM5iV$3ZTK4S3kOW>t8EJ?dmB1C}#N9Mt}oWZ*m$eB+joSyHRu6_`$fH@ zrgqfl2M+J5U%u+mjPj8)({CGrPt8{1BW2Ws;HjZc=c-M##&Oniy^*r%V#>kxF4scT@D0dv*KKC;v* z0Q3i+FvI6RcsL)o!%sh-_+a^#&ABzN-}_8fzo{A5$CM44B5R_N1?0N;_6c8p@)_Rb z@r95xX}1siTKNkbyT`$|;l_J*9GFZpA!h*#z@gJt(3QVeLZ`N)1O41stsFk9Qa(Fw zmoYpJGH&M&-BRwgLIh$H=H|Vt->KzqHHIaW44jPqY4z|sQ9WjMA6Xrq zR1tv*JT%)GufX;$T<^oG7W=?#y)O?OE9^g}Onqjgw(OEc~W3^Jkgldm7bgI4*KpYeqoJ+k>yM8X z4x=}HT=?o@;8Nm+tB?YOJuP737jgty9xlspd0{Gan2C4MNVEV3?c*FBlP& zDCV$;gDFu$h-L?#^c&~r_MeQ;M>>j1bxg{t$I^JX$6TQ+Z@-yTxBR{rD$Lpz464t$ zeXJON>eR0vtlqltZ!h!RQ8sY$u>(iY5~o~Pj%~+Q63d>-s)-5cJ2>)nS%RpF`4^xq zEFRw;rLlT^(iC5>zBUFU#`6>YzV?;UJ#TIJ;kzF=FR++qIM9>Je!E~Itm|kO65K9e zXZM>Tmj(@?Io;w2Yh#R@^JM8UY}+$d!ILQFv!sJ5Q9?Kp5Ip#dVUF-6sP%gyS%Vhr zSj3^EW3Xqen+eYB8}0UnUHRj(V(e~D{c|yP=+RC|26iv3+DnVLUS+U7Rc{E3B}LRoe3zYDw|h` zZw7Ygr5Mcu!uV2TQcjOCb8a188aH#zs^R zM@sp}qX&yU#bY3Zwwz=uv^22iBwz@Kh$E=6q}{&s;4^J_R(TMnCWIC|IC6+WArR%z z=>{{#!M{ZP$OAOsseJO+-}2G(autcH@Pv=n?L?oA%(K%LQV6z92W_n7F%D_~)B4Pq z#>NKp2dils9({(sd`QBPt$V4SH0jg1z5T6+X5%#|hAuPAdodF9$5M*TO}=CA9w;0* zQ929mtooJ3?0cSOYq2RqCb$`Nry< zHtHS>QDRK z&+9fatN(;GX%#f-0(ow2J$3rm{2Qrif)tq-nY?hu)9ezRWleV+tC3x!gXw2J&U`?* zO_~d6*HK_{x)SRq#4wH$yj!OLKuE0M-I@h27)%X&uYNVHHxDc!sD4pLy;rrg2?O2s zp~ZtK2j7NU9+0#^h?JYgFzn&2@0Sgl1WPdBQx`jZd&zukej*}FWZgCvBa;~9sh;!z za~%4S2GyLc?U{W?7hlPau)|}Q7{u*B^F3lycjVTf3ydBv-lQEra;&891i5AK;8^{5 z7KSXe_Zz^VVgWu8$ij)8Zt$YLWo)+Br0^m zA&cSiqwjuLIuNT`Pz=yvoUyTNK3+Nd9~ihnTH8*a_-EFwG*1aN;kwKUtSRguMcgbk zv!2GC0-eE1-5-a~g}ioH!hyH9(_}o1e>OFJShof%%f!gq$mCsbtUw2cbM5e92d;2O zbs#HILb$~p93VqHAlNaoQ@|yPi+*DVi<5f5fY zrvRs*Y4$@Zm+rwN6obm4$?4XxCqDtWzQ97-mQObBE~CPnh?S4;FQAn#Ho~cuOl@mx zXlTH0S{3%%M#Jrvln$JDc-#B9`ap7J!M7@I!yT22u>yq}NJFN6vVAYT zW9_g$!42~M()CzGNj=R!_74v!44D>p8}ak$-)KIzS?Rp`nLN`qHpYh~6!sc({OfOd zy@Ph6raN|sMtahniU$KQA7?(G+$QPjfff#@t2u&Xf&m@lC06tZhsA~OP|kbZ8#U<01vPYls2_$TF?Ccd&X^}N~nBd_REZg z_q{~TUyfF1yd7QlN?C2p4D^4CZXNZ*k3T^e@t*cb>Hri>yT1mXVG6DMn+BLY-V&OK zr|T}33G=GM;xn#^#4U{qhwXps;n}<>8`|(V&b%)6oUftPTno;)a*v;6hlmF_=)g;? 
z@P*@v_iMKu?uqx5Wu}f|S>b5qY>z}Sq47eClwc&vK?srw##2JAt=LNe)4GDz|6v%c zp1I_SbgMwxH*zTYHO7-R#!xXY8Glb@WGW_`l?_YIA2jVem1^sLp<9&i1e5fl`(MHo zXjq$bLren-zNu9F*oJwFYx#|PT(`82~YZ>^<%|SbIYduT5QD8@wLO#0bi+GaQ}-)mk!(i z9CDw%?U%-iEwFRh+Twu|OVG?1nv~aL%vYa$cEL=Tl>FL2`itU#q1udVBdMttnX>ra z=jCP;yby#p2WtS?;`9CUrxQ87#-a&??BFJ%m5E`~(r=2HH{b z)6Z#b{qgt@`Mt)%RJ^TN_jE4a+_4#)kevM@9BsllVOhcY+^X6UvvT^3$r?Cr=`)!J zH@!!ZX18Y>Yy3m@V~S?;XZw#7_8*Te7dr8eT|_ShhVqY1UDiI4YiFeWd|O-g4c5Fw z+_UHiuX!p5Q=d75IP+oS$#Odv0d5ok6ATkNSdf)FVnfs~QmjN9prb?^!{n^-Zd&Wq zG$X;$gy_@+006;{_{~pvv?8>pJT1fso4Pu)lExg(L3+FnQ+$jWRJhFJ0$pu-p@}NI z{qT-`>0wjrqNigdzb-PhV!{LG&Rn>F1%0+UPy2D#e7I^W)}Ek@p(N}mc@wk!8RL#zZv6?1fuVGgh; z*BXQtCBo4P!PLQy^}|H-m~YP?=rQvx*p(_lLuk;nl`j{FC2spIGt3_hv_$k}0%rH6F1L%`QW>rhw_EaKXii|9ZfV(!P8bLZ#w!hGy2I~IiEP5s~KcV5ZO@1m>oNBX|Kj35CeGZFsQK?#VC35|4<)`47H~P9h46L$m+UU`JpM z!~V5fSGs7y8Bn} zEW=t$dkm^{Nc^$wdogBbo9D9IW-tI}W(Zz5;Jazz2;SD`{L26bUYM6iux*#9qyurv zj2(TpsVVcXF+m6ay_&UX3E(o-#-j9nDXX5&uz@9JO1U@Frlf*tSKgI#2iuMBme^?QnYGB2Zfp8D}R_3KZ+#Pr&(T|R76nLA|a*)!)c zv_dtFF&7OTc?5t!b02+LJ%jnYX@7K(HBVE)#8EiJ8mGE98w167_;(BtNNGHqCU zldtK>dj~7~)5Pp6`pdg14qvo16CbEz*;%%zI%)cso#2O+z^~>CGKMf9&K9lX}5aquN=r>@nUDn{5fpWWy(uGkcEaM^)*- zM9P8DLWDY4Mi>%gm;{YZ4vB$QP-6^?lEzU#>uI#lyHx)14zZIrETk4YJzf_DFVX8Q zZ0FnBx_M3ob{_MWf9q?=R_53$oH;B(CM=NcSRE7*Fa^v^2}+cJIZDo2q0mf-h$s;N ziwjJGEW$yiwTRFnC>+rsQ6jFjF;h6~GOT>rTDj0ay{c=K8zyD-8++)TUHpEElTZbm zJS^6Vk;1X?c320QW^pLTXy@E(Xz(9fK-5fmnE8+s!qySMVI?|B@EpZKIWtr6tRES` z>#)~1`YO$ehh9V1gX+nPKRoq&GPQ0zY#N(eR%Mo> z)faj%0FOwlA)4EF^vCb)D;alhm1~C?_B_|{02lS86`jTb^k?2Z&(gbTZvS9Im~rPbPWE^LfwC$D0bKV9&gngB&#DdN z4IHLW6+|UtCY8X1;1Z6`)ZzzT1yJtm+<{Z9${z!6So!;6K34k#x9;B=rPEAE(9mh$MChVzY0{-G~S zYCW)aGgjgRb9hbc3`~B&_R&b%to&)u>qR@j^DTgO)b{cP*e?;WmuNp{tpg<;LGnq`pj*75ltC^-*jjg||3)Pg%|@kRFfM$ z(x%B8G&$HB1jueHo!{VVf&^Z1!KBQaI=}DuVAWCAo(0zRib|dL!1MNtfFQ<&z=o-E%A!(HOSs*OO}*Zmw+h=<;AX-8Kfkv&=lrj59{IsjuX6CzPj(-`OcLl3V?0gu zWQxc|5uU0%nOdy|fDRVqD0i&pK&>O_;II-&T?d)g;+lgVC8m?<8q9q~y2e4Nzk1w3&_8R-)iY*jjqYZ5{_yJ&S%j`W4ySuo3i(#}w8{ajPf7-B> z`th&6Dd|0q^qV9KZyx#c&%bc1&$5Jc(_trQTc$XO(f|>3h*H-xZ)w|XJJ<5Q0NE(2~M-mw{PW!iXm8u6OWp?e8%Iy`!KoHM-vz9fmvzu zD9hTgrO7*`3+udgn<>0~{P8c2)6fCNnwpyrZGEq5C{0b|_J7uO(HB|+mc_vgK14r) zY@Tr3K?ldJft4`K9qSO+K?Xa-0bq>83c*o)7AYNMB#ZM|I1&`Rs1<^v1fYYsMyO}j zKVMWEcX_@R;dw8hY~=SH^Ut48(u7`9Y+#$1;m(U#uA|U$KvV75haVTw)<8*Bk*SLw zc!2;)p$Sp3WcA+y*wJFnjnO8ae1UHn@2q@_cYdI6e_#pr zpm>fY73N@qn65(r93w#xf_2jx#<3bBf*=GLjOi(^wRr;E+8iPh^c0s^wouMmt)@%D zrg{7_ChzXDmHa{=^~yX3?;0GGo^#f z*RdvX=hb%rSdZ4G0iY#F1ePSL=UV9cA==p1oOMH-jJ#1l8lO4t^}tJN<;$}DV8Gd)&4Us+qOZFAd?Vunk#`mwC?SekfH6Os7A zs%>ZLtXm-i9AI})1RYUq6EiLQ%pVOC8idB{Aov^ykET( zpSSZrf3HC0@4u(Q64JZJp8owD?O=Y9hYu^z&o<;=7lXln)Pw~?6UaYOa68jxz(A9L zRIL;?N<7tEasP7(H#WR}*R%NgJk{_~saq=+U^L0FVM2R$pGno&B>?Lku;6sy~7&8OBol zMfA#)e|~p^`akF9QKx@Bg<9{T?1#-5{QPv!!OB7OU4R;_d`bC-l@UA75(Hs*0fib9 zNAXX_CmD=$6TY)*>QmxKnX{v9-F|0+V}Mo+(>6RHMS%&1Wd$q%^x}_y_EkaevB(-) zID6h$i(yab;&y)8&9UY70BK(L;Xj=?!L^5ocdFmhX3l`ff?(Rpn93&ee{~F++J?s~Ri(oL6 zR!sX_Rd_N@YTfk#a>LKIbH1jmYogKQg9Gu5&g^DQd15u%HkV?Hp)o>pqM}4(nJ=+| z7l5147^Xu!LzNq;nVNPZJLPzaYbIzM#j$r)h0%A_rL0~N0$0%d(yxg=+v>BHwsx2Y z*v9cxEb#%0=H_ihZ(xEAS^{NZ@g;pHX5K!g1XJbjd12G+vhDL$9C`oXiLsfu@oA@Od@loYb+?Qv)5=yky-Tg_ApN~ zHQkA=UeTW7nyG0o1)cCb#XXswC<+qosY)|J3$)cMB=+k_PbTHj93^A#$-aL0 zxuynwuTAqk-usXii-pm5FmbmQ4x(NUTt_C-U+3Y;bb>+)d#dteYJC87;eduNNQExq zt=|m3rFDxzjry-C|G0=Hrr*k!FLe9O6>l#cH!GvZ2<(Je(q}wANsKa&X-hOW#;m16 zF7V_+MreEg$RzX;Q0%#!Q-L>6pAH^vrWlK?n*h zgccE61ch_4`pMz=s_e;M3%x#g-9b6Ilql>kI{(h6U@FzG#^TWJM^m2jh%G|D`RuEY z*6i3``_}pwir;$R)zV4#=k%F8`<9r(!IP(N`nco|VM$e)aJ{NM+ 
zsKc0H`LLwgsOi`Qzc)d#gxBy=9c?fcugb_eh&&Q zgs1+CVro@F3lU6}06;J;XT$I_IS4&&fFLD`VIm?yPjQL$6c^XfaX@3({ls_>7iYgJ04NIRe)Ie&=xJzcF+2ZQqtjRywySyxZb^4qIz=5T#%KU& z2{Iu;meoN|fKI{za0JBwty%PgV4BZbt>6uW?hzb9YrLBljwhZ~;f%d>)~A(T`nErz zlz+m6CCu&~BiRr$#9DBe7ETBDvq#Q1HJ)p1EFXVgO=J?5O`!l&(Tzw#9iG{H?AvKY z$3LeX&QaR&w4-n2p|J7KB!-6wo+Kc@`1n|T;y;S|O+;szp8_x#cSK!Oa_NAHS=Yzp z-7#(c{m*X6tNmc@j#KB(k~EevwzS{@L_@5m41;M7GR>b6RlKcu^@lU4^J`QQ8z@+| z*!C0q2TT3bH<;M3y1lWH>ZF`R;_{HU(I##fJs15fZ#-f z?+5M0M%;YNYxZXN>G!t2RoriKeP(`T^c4hD98jo-c>S_*==9>VGYWf+t($toGyhod z`Xf(&_5R0FnQ)ZhCM`}7=IsUBy|afw+^l0I+QR^wuw?3uHFl;0lwoE6$)CKl8(S31 z>DOhq`_K(p8@24>Lg<0+V)RP)OzqSgYSO&sC3>ezrR~b~u$(O`oY;;^WJhsURmFXbI;ByAC$Y|hI^l1^zg0?+m4<*VfhTdc~?!& zJCdC_{1KU5F@T^TWgk@zM(cz_8;>n=3V)Y*4Ps>`O3p$qLACceaILGWS`7XfQoERGjdW<-l4NEReSs#pXT z=Wv{qD#y=$ zC}S3y#OlG*7EQSJ(_OaBN5>#iHYSIZ%LkqqoBhD&`xgwHYGzqV!_$PkO$+Le96chO zCr&l*es#x|hgYw@qvWytznAo$xTx<0JUhU~${)%Tm9`4LS$PfVIVG2lEjT}O;qaX0 zH{SPr#iQ?T-1=i{0gmhdm@2-OKEgvnZ4{UWYnZlcSr{}=EI9>!+|xUB>{#*Hqkf9^ zpVa^z=IxKKE9;kCJLLuh>5o%eMCMjgHG-)er_pyh(bQCym5a$B@Z_s#z$6SSfmHK8 zWCR`Al@5s{Mj&1qRSHOfZ0q6a=BB1&$G%@U9OF&REb2emTqH}AVUIEMUXguMLf)3x z^Um(Yy=+6R+6U)wz(G-;o(DEOYo7n#8EDw+r`@#f-pW^>SpU^mUz>dGogaHW9qA(C zFe&<7mU&^r(k>j50}MixEikfOC`kNbbk?yjV%KXr(uG18et>@!3ofHRdh?yi0p{6y zm$(1e^=JSwiFuKooO({*ai>mUwk=z`QYsy^gl(-Y$BrIbFg&+<#FZEiaLxVYV6x*1 zrcN6_7BtSMT3QMRUeTC!)s`izG5JMcPPo=ZcuTIi9i#G^zKZu}>c4&NObY}UWsUY6 z7qxO*@&^2%%cuz%iE z=NLgSw1^!4_+qLfXkWjqbD<+)6m$|!yE|5GtTAIg{{;2lb^sc)@;8({gqJf99zBZb zU?Q!RcfyrmbmUH)o@##ET3UWMeriF_aSxA}Rg#tW^%q~C1fw}8L>3QhRKF}1ZPzfz zE{w9<@B|9I^#}WmDH)y%n6$VWFEY*spYPsR)X6baF$4i-B2Ld;JY?GaXAdvxH*v}I zn->k4CiNfJ+N=KZAPzJMFJb(IMK&kMktDJedb^mU1)&ODJHCR^M#xF2{(QB2C;8!y zw@o&bFKNu|;$eT#tl^!sN{ps5WO~8xvoKPV8Ry1^n{xa(RsGoBR(Q#T`Mt(Jw{VF$ z7nliuO?hIGqQF|qV$>D`jRwb8N!ZfVboARJ^DemjgY6h=H|pMFgh39KGiNZPC0=0G zt$nhxIbWF5hk0So43LWxgm=nl$e6idK~0OfO=~vJY)stEVf+Sd7mfE@EMHV%YP>%c z3>x59C5t#~?M|6O9LJDJnyN##5=stFF<0LBV12>~uf4N*m|bc5Jn6Y>|FNLF-}Jhy z{FQ%u5M^ZkQd%TtI73)8?oXv7uKI5O!4r5P0eXs*QE_M%$`m@F_P*%0$DcWL=y05D zahPmd_7n!^4^vP{VIu8+dfwKhYw*qoW6+^erqB_8s*Z?K zP>M-!j)PB4jZ0|u4FGstj&;oagtvEETIc;COJ4%SDYr05EL(tUhtGIt^$WN!TnvTFQ!!ZebJEIk^$2n{nMQX4je>32kElYm^;c8cY)q} z>GjINQ&E9aD@pM(?)cgzBX7AK>J3?-;{~IU#5j(Al^IDA9pkW(M7wCba1s`!;*LyP+unCRT-0xJC-)Th7v52S>Yj3p zq+q^7x0!J(bcB={7%e(N7_}-g)*37rTKuYG(S(vjLd}mnkWW)v>yZP8R{Z(i1s7yu zHX<|EyuG7amzi67$>^{4e%@p+!=2nwIlspfigV2q+fLSh>=(>#`Sd?ZaL38C__$wZ zFMU}CdUJIA5|Z_Z90Ckdj8q|Li;yr?t0FsK$K9`zDwg0UOo1d8xkJ^Fmn}h@(_eq` zdCXF0n(nSR1>m#vR~lAh7fH>H6IJ-6j+oV+(#G&qfk;gS**2Hx-I(h3)ZEIsmyWK@ zhzyG)4YEDWW#e7Y!hYj%9d3>jwFm>8=l&tH^YtHxPh&>@=9)+GAPkw9qtg7jIWm6< z-yox-A|M4pQ&FankPt`>rYc6NjG16b;+h!|AQl1WQ~_{JJS`?K`-j^IFIO$Y>szS& zU2+Pj88q#KEoRgXsTIhPi;j@8iNr3nP=(;6(js6%a;+ICI}BPSn+TF5>yxc5-+ujF zX&UBOoQ3BXQsJGJ9CiDl%=~3H-fy;4RW`;`(sy#_>p!x_h=|)CSnRgk!~|3lM>f7qt0j0b{Ak6yGz+?cLakp)ay1H?-JWPqS8 z4qmH>3*C&ZOP|8qs!`h&~WF^(Bv9VW^EL(WT$1aF^LxIeg+N8oU^*-=0EmS z$@!xXe0$J*ed1dr04EF@bXt>t$2dbYw;z6 zIMS#&wJb1L97oOU;2ma%L90Tm-AOCi5v(y{B!C!N2!`OKM)Bi9uf6^Ak3KH%Ju&`X z2$_nuu4HI7vOr_y1jCTTE`d21zbY{8E3$T{#W9i%f?#N&DlSw7i9SY4bIT7WPOiCq z-n@%O*A1RtlbL6laqs>Z4s#Yrr2ePPSUC2NhRtp_1azWyt=d%G>6ru#|Iitgqp!80 zc2q*EMjBYiqH;^AYKvoJX%It;q4R2vlPXD)DtKBPBiXt{(Bxf0DBbhs=H@q7Zo+^e z+@YNja1-^3+X^t0Tx7sv6=>JTugD>GEkD?>U~Da{-+XTK%3B|pKXB6g^GD)c?)vnZ z4HzW}FC^Hb0Bii=P1m~IzZxGKmGwRS)ZL{RaWqUPDzMDu$f~uoyFPrS9VkzZuILR9 zsd=G=geMLAsri!*D4~PBK*f+Xn`>8^9je;2)MV^vjGg}a(x>r24fQc${=XK0_YnKc zvG0#>U%heN{F+ra-d{8J>XN<_=U+N@&V^YIT$s7wvZ?z&`W&B8MC!ji9v6+8WkwK6 zwO~zon)}IScGp#ZE%^0SFz>XB$hlyrQ+5OT#2G;18#Hs&TOZu0 
z|Mb0&OZ%9wfIC_LF-|GQAlkT~0i56tj7{QX5(Tj*wT|P7kfdcI z!wcG-G6Y=cJ$B7qo$UWa`oKf83x1p4WM7PrG=o7K6+UK-WXv#9rl5`5E4&;?rHzqn zu|>9zk0evq&7WC3)ySsrcjipZRe!C;D_CFhgT5n$RBp%Zn_85x^$6F{< zK#G<7{WV4o5>&%(?7mw-eun+de2+jh=_O}NBH<{ZjrKbyu7%dPlMvWH+*eQstT{K?PQr2L@a9mvkBf~HZ=hYRT0D}cFH0cs`mEpzC2iR$+(!~g{E*hfV1k^86Zdf zM>NdKWM{*ZwWyT+Dl9b85rQoKQ~`5MUqSj+v3q=`+{o-0C5ZqBrtOfm>hL&@Pz9{l zeiss=&ft>jopoCmV{V{M_WxzK4$G;ZA@yGxDqDG^$+ByIFpOD}bmWXvCs9Ufdz*RV z3}1p6q*vMmh=8@`5m~SOFbA(yKbZW0*(lDGkIJv>^f$k{dBG`QUQcsXB~R3Kj*N#! zEr@Kw@-5ovm&Ky(npWE(YnAx~8krg>XcsLL9B5=}LP5KWj5X^2*7_$JF&|1-)PIbD z+?Zo#q%xH~QtYB2I=6}@4A=oMenV`!8O{6mezkbo4R3ASj3nHdG@nMhbNY* z@meN6(8yGTgwf%iDXeMCz&-yiIGM^JU@igh^5fC}?*r z(=pJ<)XocC&2RW{MfL2x7Ml`PQIjnhFkz;n z@Ae&-cgg6=VR?%OPcI!XY3+lRU+wu6NvI@dC{8&RK%J=CgwNfO$5@brZvW@+evHA- z%{vlZ(V4_9h}{3dJa;p0Mi#bNi4j2vbDa1?cqfL$;-?)#pe)on2^NcWS2_p>PZzQg zK#LYg{pj`gb=5FUW(@r4rvLNBC;K)$SY18k+PS^P-hb(+IcH}+c+trDJts6|-?(nh z;@x&k{*z{OeY2So71k0GgVafqDr-Ds`tfvo!G$BubiKBw;mw@JjG1K@jV&BFb=j>C zZh!3QufO}wxSKwMd4a*tsq$ z{(K)JVeYVqOp6wX*FqdTk_1VTpN^j)#fpajM<8D843z@xkm*Id7&VNTx~!`ce&Xcp zmtYlDF>OZG=qn$YeP`LlqZ%^Mr^+>>_SriaxW9y%DXP;jR&)O1ODC3Q&U|~-1{1Aq zs7+1vqK+RwHV=Pm?<~Ox{7x9isB$P?709U{o`+d1i+YZI@{R?OlaL|p*uUI%DG(M5 z1p*v_c(H4I<;2}_l7)-!^iHv-aDWhUMV$|^vqXpu{NIV7D zA=8C;F=`kwS|DDGRFf6po~%n!c*Ad6EwiO1mv+Sm8n%*S_u0vIb1_D@>A2zsINHPL zCK3A-6}2qQd^uhu!A3}-Am#@y8I9?^oZ1qlngO_GU>f%?_U<#^ zj9^KoL_?LoB?Pj$t$p>qWfYM_u{b>+&pit+ z9bGf>kBzxE7xtT2(tAS9@W_{yrn_Rl7C}|PAI3B&95VIW{Ri<%4K4}%K{{J!HVyC9 ze7Fa*dVwc?*m!NMK6BQZdl$8s(Q&Z|eWVI8F9=MtFb@}jyFU?(9IoabRQvl|-m2`0 zo=@E4F{M&P5#ixiLwepDk8Wu3=DE{BO60PZ4WaDD?djr9M+#ng*OU)wDQJF228J*S z#Av|+BVgD?6@f5e5LILiqs0K)Pw)Tc;KK7Sk6)1zreYseVb*~3{K6h1R{m}7$8W#K zU?6<9>!Zf2{#Mm|Ii(AyP1u<)r;|0@Sh{czjcc(YWF zOytj?MD7TQu_+9}CT4FDqjn?=Mn~2#N!1vFAqbMGqKcg`8ns1aCXs;{3W6a(1gd}u zfOxTp$|S*KaH7*0LM!KK%U;W`HKN<#v&BY_>OoKH2$x z(co!IGG?NKL@Lx7)4q+E^-NJ6=dopwLV~qKD=J>C*|C0e?a;)HEy&RUTRie=@`wVTk1ddrMpEGY?iYWVkm{Lhd{CUK+Dz zE*|x#V|W&2M};+cMHj3I#)}N%kEc(+{?vwreW%n6F()0V{AlIo{yq~^t4OHv<3|Pn zlxE~rhPczRBt3uiZ3XrK^n$SAsi2HqRIYU@WGv0CEeAgPYQd!w>au254!WYe&*X+- zdBr_OA3gRXQZdMLU0N4>Jrt|h=k*wkTo9#fg9*r!Y0+DuQ?(oitYMI~s+mRBsap2) zI~K!h7xFkN@ey3mkNA?oQ5`}@CBNC_c6sd2_hA;cSS26Fjy4<@m=9fZm!YxuS0dHO zt6;+KtZ@oQ-(knbE#-a2SEwg$3wux4_mTN(Mgrs3i91zO`A_48ht@Y%tjFX6al*sv z95_KN>^JFySKl-xrN6|j> zDc?5{7a*vdYNy?TIk0S13<@6K45sHV{mVV@h+H=Y(8Vk>IPn9c7a+O7F0u%O(MUTi zlFB4n=%oP~CP-R@lkQ??sumpPuIINKHSfF;=GLXJ^2D;3mOpLA6L*zG;_=E*wkUl3 z&F!tL?wF4;?IW!m+W+m5_MapyD_XRGLj`*UXN{_H*Ov7jUz3?%HOxF~#s#B1YpxlT z{qjSrTFf^i9Xpzw3CoikxEQl_``+h7M)Uei1m^o6P86Jb`A<>*QTdTe1BN}nt;jsU zuzkN|Zl}Ea((5&Yr(qZ45Ge0E>GKahL8~4}NFGTA zo3uw*x3!p4#-4XR*f6iAV%&9wy~Y*(@8R?Q&+vk?GcgbQQ}evz)Os36Z;_BZP57b? 
z69aB<|N6_XN-rFZ&Yo@%P!0_k?JDnfxP7=`>mqa@lRSrNw)irJm3zLp9+x}_F6sEK zi)^X0z1VQgCgq?l4qmGw3z)J7_}N&_aRecB0D(pa6H+CW?B@2?_77iu8y%1M<&KQR?dX?-sZ1Q^HCE{n z0n=pWvNQt^TD<8dIdS@##o_>=;vn7j@Ke>g=WlP=Rt?Rm8kG!|{X5_!vfJAC)ecSRSC3{`+JDtR1V9B4=_?=|7ek3YdHeB2ft8;v3`iGl+NqsH-N5`~0> zK>pGe)G+*^-@NLEIha){&MsgQdg;Z3r<#qmzMbD9!*k|L<Ekto+M;=6Y5& z)Sb_wlMH5C&z#xv#5461{!}|Mzk0;X>Wut_myB6Grz#aR=@fw*Vo|YVTfH*H86X}i`$yX%A_mEwsaj=VIu){Z_p6RY6G{@@ zjbM{YdYjuX3F|*@c$SU2Dh{Ea-wzH9udx5LC4;jo2Tu9$jh%QI15p7ci=P1&okSHz zk+p?JaiJqsfHhvLBHL6WVjACw1pDv!<+7B~U81Ssll?y^ez z43ol9h4FYR&3WB-N#+UCy z@&Y4_Z;T&*eBUss;rW3KxgcL6wM=6t1%frk|0h^RGFz9L%4*RdZ@tGocWWgr#)kl~u#?Iy-1y z+>$r-e^}0v?3fc-SgJFiv}c~nzMFz z&XU~Q5RKOf2$e~~_}g3B-g@!{^Ass_)`p@6nJF?9wFM>Q4^I_{jKy!t;dI-|f0bdF zcia?;-*j|76*Eb#xM40LOtv<}ia&5;Q!pmW@8DypuuGDx0|N^LIQUp9?2?2xZC-zL z9X?sb`wa1SQ*GAF_5Z9mg{NiekQ5SzN7b-<9ZRMQ)L?cGHfj5-xEEu^KO87LD^3-OvQ zl4@RPZ+&6W(&|h!VhQR$hHF@J(_BmtCWK-5b&fg=GKhdPJFcwf4V+}~9QTg!WmO?Y~Z zb2ym7PtSdO`SbYb(mvKUkt82Kw%g#Lgta(3QM3!XtedhB7$(qGEen+uJbmPU(pJ#>~9> z7`+-4RC)hNZ$15@@$zogqqLUa^Z>rWO!h1QHvHk%944SYa`>3}EH1?2#2>;VImpk# zuQSzha-mzaxJz>Yx#$n067CnA=*`$B7^rIC^mo>8X~C>^ZgbKRNMLx{*dgQ9i$DLX zVa(MV7Bt#78#_qiNFDA9q>Wlukg#H)J3i;fV~>R+W-ZCgE9^C9?ce9uc?0{i7la!@ zrGu~7_x>kM%^l@~;{d(7Xa7-Fn>j1tO=hPR0<+ZrIcH#F&E$9(qDo;TLMgfD-k=1Ubt$;=-(`M~~z z7|s+v<_u{UxZ{lZ^vwmh!OJ7Xspqj&ssG@g+sl0UE~R3HUrPDtU_|CxHa}8~R6+<~ z@S_5aepnbwgbKU0t?l!VKCQs8Y>E0m0Mz3{@`N`-WIT7sh%_&W3Z%F|8eFm>2JS$_l~LfTY&8R zfIy=k7RC}t6=)2@=Enq5X``0??g=g8jxuU%X?b(``r2^wlB4~{V}j!Tli_~o@Nt>h zFfgyc@U+p$X1a>j&5x{EGH}MO&97n9l&DG*$m5sM;rc9 z&5Fj*!=7Mw)Tif`^qKh0zOS2`n*6j$SZwUe{&-LEkerBD^yVZ%6$9!Nh@-}E^2|0E zd>M4`%LC&T&QvC zbj=r8>VHi}?&@pr7Va)AB&R?el^dI~;1EI$xtJ~WErCWKDwH8YWm={)7(X1(Ui~E> z%Yr?Qk-dw1kKeIji|lH79IP`A3{@K;+h+1iN#83rmNhoToy}7-i>#@b8fau{Qi2H& zbGWp&?%lcjo^#SR*DiSR=6T;}~&^dr`khkKZyE=K$Q4_Hv9Adti9l2pLzDd)|EafwR&vfrWXaLN6vT zTh-7*lL=gmGyWorrt)lNTLNVo!waCovSMEOhWqfo$7y`tVs!YM)fwFXUvpQfG#cpA z%l24y&dUgL--#~7$7YL^Q*?hdLpv7FAFdQ|e zo4GD;e`)>m=+2x%%@;Q>KN*|7@Um$b=@&yiRP~+m{%bqXO+vaoVxw5$iZ>;mE~u%- zS7RaWFzA}EWz6ESZP~g=2}I=(tc2WB2j-~3)Ycjj>A;|dT#!jfWNI+-L%{g_sSR0! 
z&vk2N{uLh%KX%&#Km2e~M?!vWjFN%CbBF#{9N7CM-s!Sc-c}#feEad&t4r5DH0`E> z^Djr&df|m*mR@tm-nUKn8+8C{QwgK!x>zyAk6du}iOvUR92(zz^UZ?u#$bYj+F^P1 zxwlF@u_8dbFz>=qi$V-itt-4E*AANY-b=5`w#Ujy+YyphyIf9y5a`x4ke=8L>b#yoVz4exAw3)z{D3MhRU2giT<%+aGqi!ZkWl6yXwHV$3f zr{+`^T{xx&Qzm&&myj2pG%f5izGm9>yEeTFxE*&KShEE4OInDvm$?fyc1epae|(#{ z{qbkfnXey`Q`&3XzK=g^u~VEnt`$iu?UeWoDk5IefjCW@=L~rdHCh*5mWzhLbq>3Eb4bf z)s*YrecsGHddi#Xkzp`95oF!g+`Or3*~hQ%G|zl(NT|(ww(XqPZz9H#mYXWI(RT|m z&VTK&-1^I=74)C5aZdH#_jZHIbcrovWV^hrlUyW*z%>A3$BzTns>lKsxw)-%{e2a9 zlC^kvUVXM5{>w7xTns0AJ{TP+tQ`np&q{t@N$QXdc zsgndOcD>dvl<7!|CWKL|WTTxH2%3Si2u^A&m2AQW1ZGtjKrPZ$@uTw!rg_`~r+n9>3x4#TbmS z6Fq-h6w~LNKN5sG3$03EOx3E$0_Iu+#7mOM06|+EyjDdPFxMIoi(lvEmxapl)keQd zQWl{?9e8Ew8oXE=r@kdl!s}kBiUX&;(6}6>b^=$ac9Feh>Y}&fL=X+~x<&P^cKEE8 z_S3ICyt-n*wCcghd_{$PhfFW)Kl$x7FUSK|aDMj5=O^vwGtbpO2KAJ^h<-HKilI$QaN8X6r!LhDVmBh zg)o#F3^|r6N&Zx^Gb0TUqqg|LCKY1?wzaRnv#ciEus?AQ1iU+1nU=jIBX3TRQP0e; zJ$14PjU+A|(Ev)T%3zGaURNGHauin!sA*`ZKiayhvd<(mrx;NxPA*OoP6Wtz(U9qj z2Tgfl;Zm@nWncc+2VbgMhAZT!Zktn@nYU~GW<;{7Vrf-IP6K9+#`pW_Gw}$!A!8;6 z`)bV0Z%m(A)oW7mfQc_xn>+ky5Kwv$u&@~YYVpMrZ~|I+%N#oprvVc0Ddx^q;ectV z{D{+uJUJDUR}RTpn3mfSlZ{O0s3VkMm?}wBnd*13tQ{-A3)8f($bPWKNT#wT{%03V z%{+7Fv1|TWkIT&Ho_2CO$lJf>?dT!d4`p6C=Yr9XU4P$4dp^N7qDBR8sVxqyj-PCr z+jo3|2~52#0t2fUG97)ehw^Vb(bo3O*9VF&xx8xZpALV2a?VAUR}7w3F=FPM>t9~+ z=X>j8jP58Ms@lxFvccI$zdecL@sh2I)uY$lyCn0< zz<`r7qVr!pJg4LNnwQV7@fW*0U~goiX#li!NW9e>=Xjx4-r z?CRU+nSmJ80G13CvwNcCD zfH8#y?kRXBbTK#o_kX?c52L$qPsO3aEmkzJm>U-#+Sh01;#2xK;GZ!GLDpx@s?YzM zxshrYmcKoDwQgJWTv@+~U%tN=k54>^`NfqEK6_a(2(x(0OQBtM2-FRq@%GB+@u8^B zF1*b6!@DGf49^cF7i6Jjt-?-K1g!lF-8G&@;I*t(W4HTj*QRZYhsK{dvCDSi&se3C z%^HV5?Pb$nTk#y~vs3Kox~;jruwnM?Vz&ipE($Of(BXrJ$_7nq9FF&VQhAvwPLPZH zP1^I`hw3Fm*@5x)FL$>H&Q>sZLu41?Jwrzgge4AM90lO4NhJ)TI+!@_L^1$cD3Dn! zfyh83@MurAKlWr{uSs*y8D4h5D9rrcFdR1q^KrFRGb{&n!0ZwE_&WMjj+Wg27iahV z8k4FA=M?lFcjU-1JROT_<eDR$MMQ4uH8)yMCysArh{MZFCI7r6Tx)(#^TDQ z@SG7xj{hLnwJW9gt7g*`4`@PCJw_nA;PYz97%!Eu2nWtiWjZm^jE z0)Mz?5;+i}4-lyX=u69AV3qhqb^Co0ViL)m`n3F#%sgHB@vgz>>9-tH!Kj})&% ze6oFKMSqMQnN!=BaT!jrek|+a|jvU62~X7-xe>6E!!BQ6#uD_E2kbv)5s3xRTH_?_ITI-xqMgrUKX#jiRRhcH@= zoFk+-q`5QYiW#qf!;l|n^~@VHn02S4Ekx46Zw`L6eb*b0Kleh}(x-2ozcl}rh5e_^ z?=h#@Hwyy|NiOjy_KV`TrwQvrcVDU;*Zq-AvtCJr`uh|DqsW?kGI&= zn5rM_Sp2GE=|X-dY{|&s*SSkl;2JPl{G8fOUk`46>B3YIY=Sg(UU{0`ZY;7GnCOyc%w|5?fOJ}+ZUgI@$TA}w=}L^^FYNTf4sYH?6sx+Ce1lFtLUsuOmBg4 z4sp*PZy?kS!(a$zmR?+Cm}ke{vG7Y1s)lD*51LkR!Dvj(w7ucUHy(ZN#j0gB6R%y= zYhsOWF5nb`XCvVFO8v(Jl*VZ{O0nx5N10_G*+cdRMp2pS7lmyS?a86vu1Ntos^pF5ISlgCj$~NcAPn2zR8NMMj*D-ra%apOn$rp^lmqRUp_QgvteYEhnYI zQRB&@1r30ll-h;-vX#u;o-54xgBBAr?j@SzqV-fu3%-;6NZmr0;vat8KdH@i|KKIiN^5${YcNuODtWj>;mOAM)g()i;jaB5@Zyb$-Et>g0VQ86;`>yNEB^X{=- zh*^jo3A)U++=1wvq&c;mkX3_4)@#2z@f6}z8{#gvYJA)Z1U43adg%lS#jTqsD^P-UU&ganm~hm~p(;0PqFe!LjP!NCwL zc8zglK?)UR!H7}gH7#X7*l`3>EDpat2uvFOlg)sw0v8O?Z%!HLP2nK%5O}Tso5P3R zd-<)+53hNyu;%gW@2{D1ed&;?b1xoQ^qcgG-jnf(J%*V=Zwdq4)$sMC=raWm4W}RF zs9kbukW&@jwLAFLe$z#?i`h(15ZT+lh@GigAVv!Uz*=}TUW{U=3-Q{;!4NEVhm($k ziA7W=Rr?aJ1p$yD5YJRNicCu&{#VJbgVr(*1lhS}+wuK@8SM`Js8$Tt<29!O|Dm<{ z)Ax73y>iowmCM%MS+?@Jd+IV~75AGk_q+^Dxn479DlWk5alDwjspucI+&1ZAXT6-B%uQf zqzW`Tjv>`Rsz9Sl=(V4)SsVI7dO?*P!0~JA~Q1*oOu9lF2Kbu>*Fm= z`#$+}$C{U4X{qSF&sP52kXQl(mQ4_tHpX%cY1dSgDTE}61JhKL zDa3IMnWU*GQwT{C2d1gCjBj4-P^YFVWsZ!NHVh4lD-B?{e1GiNN3ZXA_2H*B7T2%6 zwdkSTTT2FJ&+j=JlZ=%299NY#9Rme4V#r+cm=7Z;;&I0Gn!(Y%DckeGA%JsAMdnpt zjuUENl6Ij?AX+dPdDEKk3$fB7w z9;s9*h>kQ%RH+78*?dotaohy=lQqU^72d*O@&!;UkA zx8|DE6Yx`~+dkR7cjq%(wmiJ{>3b`dUw2Q%*sJmWdExmZiq9WeF<{D)^c+0qi@bl1 z_iR!BhvhtS)!)5rLETZXlqEu08-!h`Qh*sURSIGkuRvLl8n0p64w)(i4Gxs8Q->ok 
z8pc$=AV>`kl<7#khB4JI5RardG98KeAI~o}iQDThA7kk_LTm|{o>b(}z>O(&U&=fP z|Ni(7yWiM_`}`X!9$tOZ+$A$_S(KJr&}+>6b4NTm`(Av6W7zBxn3v6Uaqwx(qTxtsLPDzGpD~jd=x#dsLK5xR&e@|=cWf5nVOUkNEK*Iik9F& zsz4)Mk`e-`0*y)05*$bsXrxPc0Tmlmmd!TtqMHhD+I+J%H>5hXSaO#k(CC7~>48*% zM)1)NAyAe)og|BOS2{Y4CxbRpW-0}j*VDWCQ376J1pIcFPiM+Ud^JOy@(d__uR};4 zaR-t-BGZ?4?c!hv7JDpCZKq2&zb;tH-Q^Qlm!w!&ksJ(F5x9oHB~E>l!UwXOpC#ZI zy30q(1Z!OsEGTWHEMTf4a1DVA$u+-24loJDkt&h2kePyD1VEX1+KGptQv(4G7#PAt z7L`;~#gVFj3DdNc1tTt$MWrKE@uDht!pNxw!H5fGQRzrk9H|PPFilHYFyca4R60@> zFRFqkjGS5!jJQx1m5x-!k*eSc)3lTYBQBIhr6X1GqAGa8$f*Uvh)W<|lKjk?R%86+ zjzv4hk{OTqcJn*s&|N-KAU{R@LD_~`Bx?s5Xml_k)j+C1V@N^=7DyFnbR0vffmDIU zkc18_kSfsVIEGXMsRE552_0AVaBU48Mp0Q% zOmzeY5MR}tS{5&dIS|mz@03#f0&!Axk!txrw){~DdfQoCe!(ermL8A-7Alcx>E<`3 zW}0oyFkkfK$56JkVS(XjHm>N054kn3`~_Is-*I}kgc0vO37bn zx`>?ASgMq>llniHKh~DgE&|tDfUFnRLl~vRxMq@A#LF=NQ^m2>>5FTQ*Ks+(z zWD-dPLjYa009h|2fzfeflGw$|F*1pQCKpa7iGX$Y(p>`o$t9qV3zmaEzF;JVvRDMh zwYW1$t8}DMU{pmQj3gu{o=J``0Yc)a-TY30IO>09`JfQGm+lhyk1YWn_1!{u33Qi0 zoDx72;ZF@T`e9)#fmDIUFl>HIAXT8z?;FMvNEK)d!{)~XQUx0QzF{nZRDs4YY<^52 zRiM%D8^#hy6=)1(15$&jijgXOMHahOHAd_@BY&zea4mu+6nAmtnw-RIm`)V{*Th4R zMLT-7j$<4avO&-;S{0a16=I!3W$%|$c7NL6^KGD+-O)flm3&BdQ83|xx{`_rv6GE3S_l>%szi!is~RJAtp8W{(p>`GCD2_0-6han0^KFh zT>{-D&|L!ECD2_0-6han0^KFhT>{-D&|L!ECD2_0-6han0^KFhT>{-D&|L!ECD2_0 Q-6han0^KF>t1p574`hx**#H0l literal 0 HcmV?d00001 diff --git a/cmd/trace-agent/windows_resources/project_16x16.ico b/cmd/trace-agent/windows_resources/project_16x16.ico new file mode 100644 index 0000000000000000000000000000000000000000..78ceecbeec3a8cd096f47dd3a34c6660ebb9774e GIT binary patch literal 1406 zcmeIvX)u&=90%~PO6k_68O)r|7X(o1RWE?{o42EjR z6-Ki*%1SI!bYP9PCPx~YG_0flv(tFzO|Sib^Lc*9JkS4m=Kla%EOd3jt~xlc4d?+_ z%dim)8E!t_3WEW{^=%)dtZ@^2zfa0~h3+bEMUk&k+&F0%l&kDlQz=SdM7vWn4=U4r+Vr%cXs zH0DXslrKeNK`ELvWcW~2j;7*rG)v`ZkyfI)RL-z0t3-?JHI!vl%o`}ntD$^V17(E* zZOoU-I3EGwxWya{G>z|=W`o=w6>#LsX|Yi3O`lf(f6$r zeI4ES-P4Ody}vNf-;cq8L8zIbK{bZeY7D7|SZ0Pe!x-TVb4D;S!WqRVXLNMDUp_IP zIDxSfpi7}5F;7S#p%g0Q^GS4M(}g)A-1+=87neTeydf5Anb@$RX6_{4VLt)6s%tcX@l4-#wkTYi*0 zY-!&NZO!dsT6DsnRVLdeYhF&YOwRzbXPsHdK7)JyGMCQ#?lj-Xb^qdufOkR2)nfbM`rN=FChY5PX5x z+_?hua|D@R2?SpY1cGnzfk@zh4=bOkjeq85v)QPJ&w(lVY+y15o0Z|KAs&86{2@NJ zF}9I`A;}8Q+@Mn#o(HvAMf&PSD75C6cBnNH2XEj^pCiO1MHo~bosb(8p1Lu(qqNFu zwaPc`$qNq8@D7DSO+tExSEzWo$Jjd)bq)T}A!GUJ1SquR6#e_KBg;Rm=;X!tCGL>O zg~8~R`y*OZWvMQKz1l~GN6&&5Lbz_pDjev1YGasY#m(;emaCigK(5$+*fcUajsNza zapR1wqeRoGt}YBaZ|HfhiIG$O*Dw0D83oB3c0*Ep{Jhy@29P@XYSV8R^u3JP{4l4u ztEwS$J3Gyu+6V9mKpOCH|0|K5Yi{5Pxk~_q@P6{kc>t;dQJ3aBXYV)$EOqj0$t#BP z&CqCQUUlEsa5aDI?WoMjui#*md?Pzeb(AR6_h`Dufp;TsN&Ivc7$pr7hMT_CY#|b? z&g9_u`)cDv{5Yu&1)xIXKT^zCtPEEzofUP@o7)tAzrE=*&`}oHYzP36ur6linPS3w z=&GzA&~;l(V=yN{s-xmcPQ~0yNH08gE^B94S){1>dO>evE8j0aS{~U)pv=)Hd)IOP zK+f*pQDg9O=Zv_cRHLi9-=%4IptyA-BvQ1{^{;PM$kzDr^$F%?#-@k>8o=2(OIQU? 
ziX@C0N9v@C(umlehKGqOJdzxJB+ItME^v-p?4}BdY`m6lG`y0#1Wj77W)!nLT$>Ow z%sb;q?Dx*S+BWN$nF#atM?$6If(*UtV3cT~FwK3RY_+#^mA7p5cDb{En@XL+)gBax z2}k+9qe3ieUz<+6bhBjnR%73QLH|sfQ-o+BN3}B+i`6nwSMiHBfn2=U4Sna`x*UHt zpOj}Brk+qa9VJ!LmD>7<)T%0zZwN3B4#g~R*4?dF zM_iu0T!{-9)+c2pukn>Sc;PLw6G9H!_Toy@@CXfq0>*5qICn)~-Bd5jf-RL#1N?cZ zBVv<&^&$N63th)XO~^@CMcw_>y!_x(a$%ryZ#Xpv|M0-8)`IeXiYq5(H#r7QJ}t#L zyZ=B+!7p3oo#uH6>X^9krJ*JymHTr@>(aBDJSM0aSop+(?K4hj60E%PguSf7# z>^*Q1ATvbHjk6t=yCF?=dBOU-e@oTjF~}qqizc6_aUo2pB_e?c*^aXlf<=3>;`F?rlNRnERfa?;_eH>v z)#Q%%!s8c`9N98Me!6QPkbOaT&`)SijSc*|JGF_wcw+mbDxmXB`=}+m2sbZQQ?k;F zSmA=e<$}u;VSqnjv76z^Gg|zF9GV|C6dMi`q_n+MLkj6)KeSAVaGY+D^yCn!9Ymo$bko5rrW?Y(^_LE&=*r_iN& zYYFG1#z&Gvo#yWeNBkLHY}q5%FFW=9?N5e!Ur+^r;lZJjND%;jb)b^P%eL@GXU9oS zelST{yCXZ`1dh8t2h^%$i97O!lV7}aOLJ~b6ck9H9+={}^QxX6^Cw(Ns8G9@* v!=S=(r%yA-Z?iWq`aj7-0^r+z5@Y7jVH>yI@J&dlFFyE8c&7ga|CjtfJ>yn~ literal 0 HcmV?d00001 diff --git a/cmd/trace-agent/windows_resources/trace-agent-msg.mc b/cmd/trace-agent/windows_resources/trace-agent-msg.mc new file mode 100644 index 0000000000000..bbccdd4eba081 --- /dev/null +++ b/cmd/trace-agent/windows_resources/trace-agent-msg.mc @@ -0,0 +1,85 @@ +;// Header +MessageIdTypedef=DWORD + +LanguageNames=( + English=0x409:MSG00409 +) + + +;// Messages +MessageId=1 +SymbolicName=MSG_WARN_REGCONFIG_FAILED +Severity=Warning +Language=English +Failed to import config items from registry. The error was %1 +. + +MessageId=2 +SymbolicName=MSG_WARN_CONFIGUPGRADE_FAILED +Severity=Warning +Language=English +Failed to upgrade configuration. The error was %1. +. + +MessageId=3 +SymbolicName=MSG_SERVICE_STARTED +Severity=Informational +Language=English +The %1 service has started. +. + +MessageId=4 +SymbolicName=MSG_SERVICE_STOPPED +Severity=Informational +Language=English +The %1 service has stopped. +. + +MessageId=5 +SymbolicName=MSG_UNKNOWN_CONTROL_REQUEST +Severity=Warning +Language=English +Unexpected control request %1 +. + +MessageId=6 +SymbolicName=MSG_SERVICE_STOPPING +Severity=Informational +Language=English +Received stop command, shutting down +. + +MessageId=7 +SymbolicName=MSG_SERVICE_STARTED +Severity=Informational +Language=English +starting the %1 service. +. + +MessageId=8 +SymbolicName=MSG_SERVICE_FAILED +Severity=Error +Language=English +The Service failed: %1 +. + +MessageId=9 +SymbolicName=MSG_SERVICE_FAILED +Severity=Warning +Language=English +The Service failed. %1 +. + +MessageId=10 +SymbolicName=MSG_UNEXPECTED_CONTROL_REQUEST +Severity=Error +Language=English +The Service failed: %1 +. + +MessageId=11 +SymbolicName=MSG_RECEIVED_STOP_COMMAND +Severity=Informational +Language=English +The service %1 received the stop command, shutting down. +. 
diff --git a/cmd/trace-agent/windows_resources/trace-agent.rc b/cmd/trace-agent/windows_resources/trace-agent.rc new file mode 100644 index 0000000000000..445bb2b349a89 --- /dev/null +++ b/cmd/trace-agent/windows_resources/trace-agent.rc @@ -0,0 +1,54 @@ +#include "version.h" +#define RT_MANIFEST 24 +#define APP_MANIFEST 1 + + +//APP_MANIFEST RT_MANIFEST agent.exe.manifest + +#define MAIN_ICON 1 +#define ICON_16_16 2 +#define ICON_32_32 3 + +MAIN_ICON ICON "project.ico" +ICON_16_16 ICON "project_16x16.ico" +ICON_32_32 ICON "project_32x32.ico" + +#define VS_VERSION_INFO 1 +VS_VERSION_INFO VERSIONINFO + FILEVERSION RC_FILE_VERSION + PRODUCTVERSION RC_FILE_VERSION + FILEFLAGSMASK 0x3fL +#ifdef _DEBUG + FILEFLAGS 0x1L +#else + FILEFLAGS 0x0L +#endif + FILEOS 0x40004L + FILETYPE 0x0L + FILESUBTYPE 0x0L +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904b0" + BEGIN + VALUE "CompanyName", "Datadog, Inc." + VALUE "FileDescription", "Datadog Trace Agent" + VALUE "FileVersion", FILE_VERSION_STRING + VALUE "InternalName", "trace-agent" + VALUE "LegalCopyright", "Copyright (C) 2017" + VALUE "OriginalFilename", "trace-agent.exe" + VALUE "ProductName", "Datadog Trace Agent" + VALUE "ProductVersion", FILE_VERSION_STRING + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1200 + END +END + +// Country: United States +// Language: English +#pragma code_page(437) +LANGUAGE 0x9, 0x1 +1 MESSAGETABLE "MSG00409.bin" diff --git a/cmd/trace-agent/windows_resources/version.h b/cmd/trace-agent/windows_resources/version.h new file mode 100644 index 0000000000000..083087b04bfff --- /dev/null +++ b/cmd/trace-agent/windows_resources/version.h @@ -0,0 +1,7 @@ + +#define RC_FILE_VERSION MAJ_VER,MIN_VER,PATCH_VER,0 + +#define STRINGIFY(x) #x +#define TO_STRING(x) STRINGIFY(x) + +#define FILE_VERSION_STRING TO_STRING(MAJ_VER.MIN_VER.PATCH_VER.0) \ No newline at end of file diff --git a/pkg/trace/agent/normalizer.go b/pkg/trace/agent/normalizer.go new file mode 100644 index 0000000000000..e28e819f48ac1 --- /dev/null +++ b/pkg/trace/agent/normalizer.go @@ -0,0 +1,262 @@ +package agent + +import ( + "bytes" + "errors" + "fmt" + "strconv" + "strings" + "time" + "unicode/utf8" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" +) + +const ( + // MaxServiceLen the maximum length a service can have + MaxServiceLen = 100 + // MaxNameLen the maximum length a name can have + MaxNameLen = 100 + // MaxTypeLen the maximum length a span type can have + MaxTypeLen = 100 + // MaxEndDateOffset the maximum amount of time in the future we + // tolerate for span end dates + MaxEndDateOffset = 10 * time.Minute +) + +var ( + // Year2000NanosecTS is an arbitrary cutoff to spot weird-looking values + Year2000NanosecTS = time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC).UnixNano() +) + +// Normalize makes sure a Span is properly initialized and encloses the minimum required info +func Normalize(s *pb.Span) error { + // Service + if s.Service == "" { + return errors.New("empty `Service`") + } + if len(s.Service) > MaxServiceLen { + return fmt.Errorf("`Service` too long (%d chars max): %s", MaxServiceLen, s.Service) + } + // service shall comply with Datadog tag normalization as it's eventually a tag + s.Service = NormalizeTag(s.Service) + if s.Service == "" { + return fmt.Errorf("invalid `Service`: %s", s.Service) + } + + // Name + if s.Name == "" { + return errors.New("empty `Name`") + } + if len(s.Name) > MaxNameLen { + return fmt.Errorf("`Name` too long (%d chars max): %s", 
MaxNameLen, s.Name) + } + // name shall comply with Datadog metric name normalization + name, ok := normMetricNameParse(s.Name) + if !ok { + return fmt.Errorf("invalid `Name`: %s", s.Name) + } + s.Name = name + + // Resource + s.Resource = toUTF8(s.Resource) + if s.Resource == "" { + return errors.New("empty `Resource`") + } + + // ParentID, TraceID and SpanID set in the client could be the same + // Supporting the ParentID == TraceID == SpanID for the root span, is compliant + // with the Zipkin implementation. Furthermore, as described in the PR + // https://github.com/openzipkin/zipkin/pull/851 the constraint that the + // root span's ``trace id = span id`` has been removed + if s.ParentID == s.TraceID && s.ParentID == s.SpanID { + s.ParentID = 0 + log.Debugf("span.normalize: `ParentID`, `TraceID` and `SpanID` are the same; `ParentID` set to 0: %d", s.TraceID) + } + + // Start & Duration as nanoseconds timestamps + // if s.Start is very little, less than year 2000 probably a unit issue so discard + // (or it is "le bug de l'an 2000") + if s.Start < Year2000NanosecTS { + return fmt.Errorf("invalid `Start` (must be nanosecond epoch): %d", s.Start) + } + + // If the end date is too far away in the future, it's probably a mistake. + if s.Start+s.Duration > time.Now().Add(MaxEndDateOffset).UnixNano() { + return fmt.Errorf("invalid `Start`+`Duration`: too far in the future") + } + + if s.Duration <= 0 { + return fmt.Errorf("invalid `Duration`: %d", s.Duration) + } + + // ParentID set on the client side, no way of checking + + // Type + s.Type = toUTF8(s.Type) + if len(s.Type) > MaxTypeLen { + return fmt.Errorf("`Type` too long (%d chars max): %s", MaxTypeLen, s.Type) + } + + for k, v := range s.Meta { + utf8K := toUTF8(k) + + if k != utf8K { + delete(s.Meta, k) + k = utf8K + } + + s.Meta[k] = toUTF8(v) + } + + // Environment + if env, ok := s.Meta["env"]; ok { + s.Meta["env"] = NormalizeTag(env) + } + + // Status Code + if sc, ok := s.Meta["http.status_code"]; ok { + if !isValidStatusCode(sc) { + delete(s.Meta, "http.status_code") + log.Debugf("Drop invalid meta `http.status_code`: %s", sc) + } + } + + return nil +} + +// NormalizeTrace takes a trace and +// * rejects the trace if there is a trace ID discrepancy between 2 spans +// * rejects the trace if two spans have the same span_id +// * rejects empty traces +// * rejects traces where at least one span cannot be normalized +// * return the normalized trace and an error: +// - nil if the trace can be accepted +// - an error string if the trace needs to be dropped +func NormalizeTrace(t pb.Trace) error { + if len(t) == 0 { + return errors.New("empty trace") + } + + spanIDs := make(map[uint64]struct{}) + traceID := t[0].TraceID + + for _, span := range t { + if span.TraceID == 0 { + return errors.New("empty `TraceID`") + } + if span.SpanID == 0 { + return errors.New("empty `SpanID`") + } + if _, ok := spanIDs[span.SpanID]; ok { + return fmt.Errorf("duplicate `SpanID` %v (span %v)", span.SpanID, span) + } + if span.TraceID != traceID { + return fmt.Errorf("foreign span in trace (Name:TraceID) %s:%x != %s:%x", t[0].Name, t[0].TraceID, span.Name, span.TraceID) + } + if err := Normalize(span); err != nil { + return fmt.Errorf("invalid span (SpanID:%d): %v", span.SpanID, err) + } + spanIDs[span.SpanID] = struct{}{} + } + + return nil +} + +func isValidStatusCode(sc string) bool { + if code, err := strconv.ParseUint(sc, 10, 64); err == nil { + return 100 <= code && code < 600 + } + return false +} + +// This code is borrowed from dd-go metric 
normalization + +// fast isAlpha for ascii +func isAlpha(b byte) bool { + return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') +} + +// fast isAlphaNumeric for ascii +func isAlphaNum(b byte) bool { + return isAlpha(b) || (b >= '0' && b <= '9') +} + +// normMetricNameParse normalizes metric names with a parser instead of using +// garbage-creating string replacement routines. +func normMetricNameParse(name string) (string, bool) { + if name == "" || len(name) > MaxNameLen { + return name, false + } + + var i, ptr int + res := make([]byte, 0, len(name)) + + // skip non-alphabetic characters + for ; i < len(name) && !isAlpha(name[i]); i++ { + } + + // if there were no alphabetic characters it wasn't valid + if i == len(name) { + return "", false + } + + for ; i < len(name); i++ { + switch { + case isAlphaNum(name[i]): + res = append(res, name[i]) + ptr++ + case name[i] == '.': + // we skipped all non-alpha chars up front so we have seen at least one + switch res[ptr-1] { + // overwrite underscores that happen before periods + case '_': + res[ptr-1] = '.' + default: + res = append(res, '.') + ptr++ + } + default: + // we skipped all non-alpha chars up front so we have seen at least one + switch res[ptr-1] { + // no double underscores, no underscores after periods + case '.', '_': + default: + res = append(res, '_') + ptr++ + } + } + } + + if res[ptr-1] == '_' { + res = res[:ptr-1] + } + + return string(res), true +} + +// toUTF8 forces the string to utf-8 by replacing illegal character sequences with the utf-8 replacement character. +func toUTF8(s string) string { + if utf8.ValidString(s) { + // if string is already valid utf8, return it as-is. Checking validity is cheaper than blindly rewriting. + return s + } + + in := strings.NewReader(s) + var out bytes.Buffer + out.Grow(len(s)) + + for { + r, _, err := in.ReadRune() + if err != nil { + // note: by contract, if `in` contains non-valid utf-8, no error is returned. Rather the utf-8 replacement + // character is returned. Therefore, the only error should usually be io.EOF indicating end of string. + // If any other error is returned by chance, we quit as well, outputting whatever part of the string we + // had already constructed. 
+ return out.String() + } + + out.WriteRune(r) + } +} diff --git a/pkg/trace/agent/normalizer_test.go b/pkg/trace/agent/normalizer_test.go new file mode 100644 index 0000000000000..0aa9ff9b7d483 --- /dev/null +++ b/pkg/trace/agent/normalizer_test.go @@ -0,0 +1,393 @@ +package agent + +import ( + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func testSpan() *pb.Span { + return &pb.Span{ + Duration: 10000000, + Error: 0, + Resource: "GET /some/raclette", + Service: "django", + Name: "django.controller", + SpanID: 42, + Start: 1448466874000000000, + TraceID: 424242, + Meta: map[string]string{ + "user": "leo", + "pool": "fondue", + }, + Metrics: map[string]float64{ + "cheese_weight": 100000.0, + }, + ParentID: 1111, + Type: "http", + } +} + +func TestNormalizeOK(t *testing.T) { + s := testSpan() + assert.NoError(t, Normalize(s)) +} + +func TestNormalizeServicePassThru(t *testing.T) { + s := testSpan() + before := s.Service + Normalize(s) + assert.Equal(t, before, s.Service) +} + +func TestNormalizeEmptyService(t *testing.T) { + s := testSpan() + s.Service = "" + assert.Error(t, Normalize(s)) +} + +func TestNormalizeLongService(t *testing.T) { + s := testSpan() + s.Service = strings.Repeat("CAMEMBERT", 100) + assert.Error(t, Normalize(s)) +} + +func TestNormalizeNamePassThru(t *testing.T) { + s := testSpan() + before := s.Name + Normalize(s) + assert.Equal(t, before, s.Name) +} + +func TestNormalizeEmptyName(t *testing.T) { + s := testSpan() + s.Name = "" + assert.Error(t, Normalize(s)) +} + +func TestNormalizeLongName(t *testing.T) { + s := testSpan() + s.Name = strings.Repeat("CAMEMBERT", 100) + assert.Error(t, Normalize(s)) +} + +func TestNormalizeName(t *testing.T) { + expNames := map[string]string{ + "pylons.controller": "pylons.controller", + "trace-api.request": "trace_api.request", + } + + s := testSpan() + for name, expName := range expNames { + s.Name = name + assert.NoError(t, Normalize(s)) + assert.Equal(t, expName, s.Name) + } +} + +func TestNormalizeNameFailure(t *testing.T) { + invalidNames := []string{ + "", // Empty. + "/", // No alphanumerics. + "//", // Still no alphanumerics. + strings.Repeat("x", MaxNameLen+1), // Too long. 
+ } + s := testSpan() + for _, v := range invalidNames { + s.Name = v + assert.Error(t, Normalize(s)) + } +} + +func TestNormalizeResourcePassThru(t *testing.T) { + s := testSpan() + before := s.Resource + Normalize(s) + assert.Equal(t, before, s.Resource) +} + +func TestNormalizeEmptyResource(t *testing.T) { + s := testSpan() + s.Resource = "" + assert.Error(t, Normalize(s)) +} + +func TestNormalizeTraceIDPassThru(t *testing.T) { + s := testSpan() + before := s.TraceID + Normalize(s) + assert.Equal(t, before, s.TraceID) +} + +func TestNormalizeNoTraceID(t *testing.T) { + s := testSpan() + s.TraceID = 0 + Normalize(s) + assert.NotEqual(t, 0, s.TraceID) +} + +func TestNormalizeSpanIDPassThru(t *testing.T) { + s := testSpan() + before := s.SpanID + Normalize(s) + assert.Equal(t, before, s.SpanID) +} + +func TestNormalizeNoSpanID(t *testing.T) { + s := testSpan() + s.SpanID = 0 + Normalize(s) + assert.NotEqual(t, 0, s.SpanID) +} + +func TestNormalizeStartPassThru(t *testing.T) { + s := testSpan() + before := s.Start + Normalize(s) + assert.Equal(t, before, s.Start) +} + +func TestNormalizeStartTooSmall(t *testing.T) { + s := testSpan() + s.Start = 42 + assert.Error(t, Normalize(s)) +} + +func TestNormalizeStartTooLarge(t *testing.T) { + s := testSpan() + s.Start = time.Now().Add(15 * time.Minute).UnixNano() + assert.Error(t, Normalize(s)) +} + +func TestNormalizeDurationPassThru(t *testing.T) { + s := testSpan() + before := s.Duration + Normalize(s) + assert.Equal(t, before, s.Duration) +} + +func TestNormalizeEmptyDuration(t *testing.T) { + s := testSpan() + s.Duration = 0 + assert.Error(t, Normalize(s)) +} + +func TestNormalizeNegativeDuration(t *testing.T) { + s := testSpan() + s.Duration = -50 + assert.Error(t, Normalize(s)) +} + +func TestNormalizeErrorPassThru(t *testing.T) { + s := testSpan() + before := s.Error + Normalize(s) + assert.Equal(t, before, s.Error) +} + +func TestNormalizeMetricsPassThru(t *testing.T) { + s := testSpan() + before := s.Metrics + Normalize(s) + assert.Equal(t, before, s.Metrics) +} + +func TestNormalizeMetaPassThru(t *testing.T) { + s := testSpan() + before := s.Meta + Normalize(s) + assert.Equal(t, before, s.Meta) +} + +func TestNormalizeParentIDPassThru(t *testing.T) { + s := testSpan() + before := s.ParentID + Normalize(s) + assert.Equal(t, before, s.ParentID) +} + +func TestNormalizeTypePassThru(t *testing.T) { + s := testSpan() + before := s.Type + Normalize(s) + assert.Equal(t, before, s.Type) +} + +func TestNormalizeTypeTooLong(t *testing.T) { + s := testSpan() + s.Type = strings.Repeat("sql", 1000) + Normalize(s) + assert.Error(t, Normalize(s)) +} + +func TestNormalizeServiceTag(t *testing.T) { + s := testSpan() + s.Service = "retargeting(api-Staging " + Normalize(s) + assert.Equal(t, "retargeting_api-staging", s.Service) +} + +func TestNormalizeEnv(t *testing.T) { + s := testSpan() + s.Meta["env"] = "DEVELOPMENT" + Normalize(s) + assert.Equal(t, "development", s.Meta["env"]) +} + +func TestSpecialZipkinRootSpan(t *testing.T) { + s := testSpan() + s.ParentID = 42 + s.TraceID = 42 + s.SpanID = 42 + beforeTraceID := s.TraceID + beforeSpanID := s.SpanID + Normalize(s) + assert.Equal(t, uint64(0), s.ParentID) + assert.Equal(t, beforeTraceID, s.TraceID) + assert.Equal(t, beforeSpanID, s.SpanID) +} + +func TestNormalizeTraceEmpty(t *testing.T) { + trace := pb.Trace{} + + err := NormalizeTrace(trace) + assert.Error(t, err) +} + +func TestNormalizeTraceTraceIdMismatch(t *testing.T) { + span1 := testSpan() + span1.TraceID = 1 + + span2 := testSpan() + 
span2.TraceID = 2 + + trace := pb.Trace{span1, span2} + + err := NormalizeTrace(trace) + assert.Error(t, err) +} + +func TestNormalizeTraceInvalidSpan(t *testing.T) { + span1 := testSpan() + + span2 := testSpan() + span2.Name = "" // invalid + + trace := pb.Trace{span1, span2} + + err := NormalizeTrace(trace) + assert.Error(t, err) +} + +func TestNormalizeTraceDuplicateSpanID(t *testing.T) { + span1 := testSpan() + span2 := testSpan() + span2.SpanID = span1.SpanID + + trace := pb.Trace{span1, span2} + + err := NormalizeTrace(trace) + assert.Error(t, err) +} + +func TestNormalizeTrace(t *testing.T) { + span1 := testSpan() + + span2 := testSpan() + span2.SpanID++ + + trace := pb.Trace{span1, span2} + + err := NormalizeTrace(trace) + assert.NoError(t, err) +} + +func TestIsValidStatusCode(t *testing.T) { + assert := assert.New(t) + assert.True(isValidStatusCode("100")) + assert.True(isValidStatusCode("599")) + assert.False(isValidStatusCode("99")) + assert.False(isValidStatusCode("600")) + assert.False(isValidStatusCode("Invalid status code")) +} + +func TestNormalizeInvalidUTF8(t *testing.T) { + invalidUTF8 := "test\x99\x8f" + + t.Run("service", func(t *testing.T) { + assert := assert.New(t) + + span := testSpan() + span.Service = invalidUTF8 + + err := Normalize(span) + + assert.Nil(err) + assert.Equal("test", span.Service) + }) + + t.Run("resource", func(t *testing.T) { + assert := assert.New(t) + + span := testSpan() + span.Resource = invalidUTF8 + + err := Normalize(span) + + assert.Nil(err) + assert.Equal("test��", span.Resource) + }) + + t.Run("name", func(t *testing.T) { + assert := assert.New(t) + + span := testSpan() + span.Name = invalidUTF8 + + err := Normalize(span) + + assert.Nil(err) + assert.Equal("test", span.Name) + }) + + t.Run("type", func(t *testing.T) { + assert := assert.New(t) + + span := testSpan() + span.Type = invalidUTF8 + + err := Normalize(span) + + assert.Nil(err) + assert.Equal("test��", span.Type) + }) + + t.Run("meta", func(t *testing.T) { + assert := assert.New(t) + + span := testSpan() + span.Meta = map[string]string{ + invalidUTF8: "test1", + "test2": invalidUTF8, + } + + err := Normalize(span) + + assert.Nil(err) + assert.EqualValues(map[string]string{ + "test��": "test1", + "test2": "test��", + }, span.Meta) + }) +} + +func BenchmarkNormalization(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i += 1 { + Normalize(testSpan()) + } +} diff --git a/pkg/trace/agent/processed_trace.go b/pkg/trace/agent/processed_trace.go new file mode 100644 index 0000000000000..94070bb33c101 --- /dev/null +++ b/pkg/trace/agent/processed_trace.go @@ -0,0 +1,26 @@ +package agent + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +type ProcessedTrace struct { + Trace pb.Trace + WeightedTrace WeightedTrace + Root *pb.Span + Env string + Sublayers map[*pb.Span][]SublayerValue + Sampled bool +} + +func (pt *ProcessedTrace) Weight() float64 { + if pt.Root == nil { + return 1.0 + } + return sampler.Weight(pt.Root) +} + +func (pt *ProcessedTrace) GetSamplingPriority() (sampler.SamplingPriority, bool) { + return sampler.GetSamplingPriority(pt.Root) +} diff --git a/pkg/trace/agent/stats.go b/pkg/trace/agent/stats.go new file mode 100644 index 0000000000000..5975809958093 --- /dev/null +++ b/pkg/trace/agent/stats.go @@ -0,0 +1,157 @@ +package agent + +import ( + "fmt" + + "github.com/DataDog/datadog-agent/pkg/trace/quantile" +) + +// Hardcoded measures names for ease of reference +const ( + HITS string = 
"hits" + ERRORS = "errors" + DURATION = "duration" +) + +var ( + // DefaultCounts is an array of the measures we represent as Count by default + DefaultCounts = [...]string{HITS, ERRORS, DURATION} + // DefaultDistributions is an array of the measures we represent as Distribution by default + // Not really used right now as we don't have a way to easily add new distros + DefaultDistributions = [...]string{DURATION} +) + +// Count represents one specific "metric" we track for a given tagset +// A count keeps track of the total for a metric during a given time in a certain dimension. +// By default we keep count of "hits", "errors" and "durations". Others can be added +// (from the Metrics map in a span), but they have to be enabled manually. +// +// Example: hits between X and X+5s for service:dogweb and resource:dash.list +type Count struct { + Key string `json:"key"` + Name string `json:"name"` // the name of the trace/spans we count (was a member of TagSet) + Measure string `json:"measure"` // represents the entity we count, e.g. "hits", "errors", "time" (was Name) + TagSet TagSet `json:"tagset"` // set of tags for which we account this Distribution + + TopLevel float64 `json:"top_level"` // number of top-level spans contributing to this count + + Value float64 `json:"value"` // accumulated values +} + +// Distribution represents a true image of the spectrum of values, allowing arbitrary quantile queries +// A distribution works the same way Counts do, but instead of accumulating values it keeps a sense of +// the repartition of the values. It uses the Greenwald-Khanna online summary algorithm. +// +// A distribution can answer to an arbitrary quantile query within a given epsilon. For each "range" of +// values in our pseudo-histogram we keep a trace ID (a sample) so that we can give the user an example +// of a trace for a given quantile query. +type Distribution struct { + Key string `json:"key"` + Name string `json:"name"` // the name of the trace/spans we count (was a member of TagSet) + Measure string `json:"measure"` // represents the entity we count, e.g. 
"hits", "errors", "time" + TagSet TagSet `json:"tagset"` // set of tags for which we account this Distribution + + TopLevel float64 `json:"top_level"` // number of top-level spans contributing to this count + + Summary *quantile.SliceSummary `json:"summary"` // actual representation of data +} + +// GrainKey generates the key used to aggregate counts and distributions +// which is of the form: name|measure|aggr +// for example: serve|duration|service:webserver +func GrainKey(name, measure, aggr string) string { + return name + "|" + measure + "|" + aggr +} + +// NewCount returns a new Count for a metric and a given tag set +func NewCount(m, ckey, name string, tgs TagSet) Count { + return Count{ + Key: ckey, + Name: name, + Measure: m, + TagSet: tgs, // note: by doing this, tgs is a ref shared by all objects created with the same arg + Value: 0.0, + } +} + +// Add adds some values to one count +func (c Count) Add(v float64) Count { + c.Value += v + return c +} + +// Merge is used when 2 Counts represent the same thing and adds Values +func (c Count) Merge(c2 Count) Count { + if c.Key != c2.Key { + err := fmt.Errorf("Trying to merge non-homogoneous counts [%s] and [%s]", c.Key, c2.Key) + panic(err) + } + + c.Value += c2.Value + return c +} + +// NewDistribution returns a new Distribution for a metric and a given tag set +func NewDistribution(m, ckey, name string, tgs TagSet) Distribution { + return Distribution{ + Key: ckey, + Name: name, + Measure: m, + TagSet: tgs, // note: by doing this, tgs is a ref shared by all objects created with the same arg + Summary: quantile.NewSliceSummary(), + } +} + +// Add inserts the proper values in a given distribution from a span +func (d Distribution) Add(v float64, sampleID uint64) { + d.Summary.Insert(v, sampleID) +} + +// Merge is used when 2 Distributions represent the same thing and it merges the 2 underlying summaries +func (d Distribution) Merge(d2 Distribution) { + // We don't check tagsets for distributions as we reaggregate without reallocating new structs + d.Summary.Merge(d2.Summary) +} + +// Weigh applies a weight factor to a distribution and return the result as a +// new distribution. 
+func (d Distribution) Weigh(weight float64) Distribution {
+	d2 := Distribution(d)
+	d2.Summary = quantile.WeighSummary(d.Summary, weight)
+	return d2
+}
+
+// Copy returns a distribution with the same data but a different underlying summary
+func (d Distribution) Copy() Distribution {
+	d2 := Distribution(d)
+	d2.Summary = d.Summary.Copy()
+	return d2
+}
+
+// StatsBucket is a time bucket tracking statistics across multiple Counts
+type StatsBucket struct {
+	Start    int64 // Timestamp of start in our format
+	Duration int64 // Duration of a bucket in nanoseconds
+
+	// Stats indexed by keys
+	Counts           map[string]Count        // All the counts
+	Distributions    map[string]Distribution // All the distributions (e.g.: for quantile queries)
+	ErrDistributions map[string]Distribution // All the error distributions (e.g.: for apdex, as they account for frustrated requests)
+}
+
+// NewStatsBucket opens a new bucket for time ts and initializes it properly
+func NewStatsBucket(ts, d int64) StatsBucket {
+	// Note: Duration is initialized here to the expected bucket length d; it can be adjusted later by whoever closes the bucket
+	return StatsBucket{
+		Start:            ts,
+		Duration:         d,
+		Counts:           make(map[string]Count),
+		Distributions:    make(map[string]Distribution),
+		ErrDistributions: make(map[string]Distribution),
+	}
+}
+
+// IsEmpty reports whether this stats bucket holds no information (in which case it is useless)
+func (sb StatsBucket) IsEmpty() bool {
+	return len(sb.Counts) == 0 && len(sb.Distributions) == 0 && len(sb.ErrDistributions) == 0
+}
diff --git a/pkg/trace/agent/stats_payload.go b/pkg/trace/agent/stats_payload.go
new file mode 100644
index 0000000000000..e40901a455745
--- /dev/null
+++ b/pkg/trace/agent/stats_payload.go
@@ -0,0 +1,29 @@
+package agent
+
+import (
+	"bytes"
+	"compress/gzip"
+	"encoding/json"
+)
+
+// StatsPayload represents the payload to be flushed to the stats endpoint
+type StatsPayload struct {
+	HostName string        `json:"hostname"`
+	Env      string        `json:"env"`
+	Stats    []StatsBucket `json:"stats"`
+}
+
+// EncodeStatsPayload encodes the stats payload as json/gzip.
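+//
+// As a hypothetical sketch (not part of the original change), a consumer could
+// reverse the encoding with a gzip reader feeding a JSON decoder:
+//
+//	func decodeStatsPayload(b []byte) (*StatsPayload, error) {
+//		gz, err := gzip.NewReader(bytes.NewReader(b))
+//		if err != nil {
+//			return nil, err
+//		}
+//		defer gz.Close()
+//		var p StatsPayload
+//		if err := json.NewDecoder(gz).Decode(&p); err != nil {
+//			return nil, err
+//		}
+//		return &p, nil
+//	}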
+func EncodeStatsPayload(payload *StatsPayload) ([]byte, error) { + var b bytes.Buffer + var err error + + gz, err := gzip.NewWriterLevel(&b, gzip.BestSpeed) + if err != nil { + return nil, err + } + err = json.NewEncoder(gz).Encode(payload) + gz.Close() + + return b.Bytes(), err +} diff --git a/pkg/trace/agent/stats_test.go b/pkg/trace/agent/stats_test.go new file mode 100644 index 0000000000000..789e52624ec58 --- /dev/null +++ b/pkg/trace/agent/stats_test.go @@ -0,0 +1,639 @@ +package agent + +import ( + "bytes" + "fmt" + "strconv" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/quantile" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + "github.com/stretchr/testify/assert" +) + +const defaultEnv = "default" + +func testWeightedSpans() WeightedTrace { + spans := []pb.Span{ + pb.Span{Service: "A", Name: "A.foo", Resource: "α", Duration: 1}, + pb.Span{Service: "A", Name: "A.foo", Resource: "β", Duration: 2, Error: 1}, + pb.Span{Service: "B", Name: "B.foo", Resource: "γ", Duration: 3}, + pb.Span{Service: "B", Name: "B.foo", Resource: "ε", Duration: 4, Error: 404}, + pb.Span{Service: "B", Name: "B.foo", Resource: "ζ", Duration: 5, Meta: map[string]string{"version": "1.3"}}, + pb.Span{Service: "B", Name: "sql.query", Resource: "ζ", Duration: 6, Meta: map[string]string{"version": "1.4"}}, + pb.Span{Service: "C", Name: "sql.query", Resource: "δ", Duration: 7}, + pb.Span{Service: "C", Name: "sql.query", Resource: "δ", Duration: 8}, + } + tws := make(WeightedTrace, len(spans)) + for i := range spans { + tws[i] = &WeightedSpan{ + Span: &spans[i], + Weight: 1, + TopLevel: true, + } + } + return tws +} + +func testTrace() pb.Trace { + // Data below represents a trace with some sublayers, so that we make sure, + // those data are correctly calculated when aggregating in HandleSpan() + // A |---------------------------------------------------------------| duration: 100 + // B |----------------------| duration: 20 + // C |-----| |---| duration: 5+3 + trace := pb.Trace{ + &pb.Span{TraceID: 42, SpanID: 42, ParentID: 0, Service: "A", + Name: "A.foo", Type: "web", Resource: "α", Start: 0, Duration: 100, + Metrics: map[string]float64{sampler.KeySamplingRateGlobal: 0.5}}, + &pb.Span{TraceID: 42, SpanID: 100, ParentID: 42, Service: "B", + Name: "B.bar", Type: "web", Resource: "α", Start: 1, Duration: 20}, + &pb.Span{TraceID: 42, SpanID: 2000, ParentID: 100, Service: "C", + Name: "sql.query", Type: "sql", Resource: "SELECT value FROM table", + Start: 2, Duration: 5}, + &pb.Span{TraceID: 42, SpanID: 3000, ParentID: 100, Service: "C", + Name: "sql.query", Type: "sql", Resource: "SELECT ololololo... value FROM table", + Start: 10, Duration: 3, Error: 1}, + } + + traceutil.ComputeTopLevel(trace) + return trace +} + +func testTraceTopLevel() pb.Trace { + // Data below represents a trace with some sublayers, so that we make sure, + // those data are correctly calculated when aggregating in HandleSpan() + // In this case, the sublayers B and C have been merged into B, + // showing what happens when some spans are not marked as top-level. 
+ // A |---------------------------------------------------------------| duration: 100 + // B |----------------------| duration: 20 + // B |-----| |---| duration: 5+3 + trace := pb.Trace{ + &pb.Span{TraceID: 42, SpanID: 42, ParentID: 0, Service: "A", + Name: "A.foo", Type: "web", Resource: "α", Start: 0, Duration: 100, + Metrics: map[string]float64{sampler.KeySamplingRateGlobal: 1}}, + &pb.Span{TraceID: 42, SpanID: 100, ParentID: 42, Service: "B", + Name: "B.bar", Type: "web", Resource: "α", Start: 1, Duration: 20}, + &pb.Span{TraceID: 42, SpanID: 2000, ParentID: 100, Service: "B", + Name: "B.bar.1", Type: "web", Resource: "α", + Start: 2, Duration: 5}, + &pb.Span{TraceID: 42, SpanID: 3000, ParentID: 100, Service: "B", + Name: "B.bar.2", Type: "web", Resource: "α", + Start: 10, Duration: 3, Error: 1}, + } + + traceutil.ComputeTopLevel(trace) + return trace +} + +func TestGrainKey(t *testing.T) { + assert := assert.New(t) + gk := GrainKey("serve", "duration", "service:webserver") + assert.Equal("serve|duration|service:webserver", gk) +} + +type expectedCount struct { + value float64 + topLevel float64 +} + +type expectedDistribution struct { + entries []quantile.Entry + topLevel float64 +} + +func TestStatsBucketDefault(t *testing.T) { + assert := assert.New(t) + + srb := NewStatsRawBucket(0, 1e9) + + // No custom aggregators only the defaults + aggr := []string{} + for _, s := range testWeightedSpans() { + t.Logf("weight: %f, topLevel: %v", s.Weight, s.TopLevel) + srb.HandleSpan(s, defaultEnv, aggr, nil) + } + sb := srb.Export() + + expectedCounts := map[string]expectedCount{ + "A.foo|duration|env:default,resource:α,service:A": expectedCount{value: 1, topLevel: 1}, + "A.foo|duration|env:default,resource:β,service:A": expectedCount{value: 2, topLevel: 1}, + "B.foo|duration|env:default,resource:γ,service:B": expectedCount{value: 3, topLevel: 1}, + "B.foo|duration|env:default,resource:ε,service:B": expectedCount{value: 4, topLevel: 1}, + "B.foo|duration|env:default,resource:ζ,service:B": expectedCount{value: 5, topLevel: 1}, + "sql.query|duration|env:default,resource:ζ,service:B": expectedCount{value: 6, topLevel: 1}, + "sql.query|duration|env:default,resource:δ,service:C": expectedCount{value: 15, topLevel: 2}, + "A.foo|errors|env:default,resource:α,service:A": expectedCount{value: 0, topLevel: 1}, + "A.foo|errors|env:default,resource:β,service:A": expectedCount{value: 1, topLevel: 1}, + "B.foo|errors|env:default,resource:γ,service:B": expectedCount{value: 0, topLevel: 1}, + "B.foo|errors|env:default,resource:ε,service:B": expectedCount{value: 1, topLevel: 1}, + "B.foo|errors|env:default,resource:ζ,service:B": expectedCount{value: 0, topLevel: 1}, + "sql.query|errors|env:default,resource:ζ,service:B": expectedCount{value: 0, topLevel: 1}, + "sql.query|errors|env:default,resource:δ,service:C": expectedCount{value: 0, topLevel: 2}, + "A.foo|hits|env:default,resource:α,service:A": expectedCount{value: 1, topLevel: 1}, + "A.foo|hits|env:default,resource:β,service:A": expectedCount{value: 1, topLevel: 1}, + "B.foo|hits|env:default,resource:γ,service:B": expectedCount{value: 1, topLevel: 1}, + "B.foo|hits|env:default,resource:ε,service:B": expectedCount{value: 1, topLevel: 1}, + "B.foo|hits|env:default,resource:ζ,service:B": expectedCount{value: 1, topLevel: 1}, + "sql.query|hits|env:default,resource:ζ,service:B": expectedCount{value: 1, topLevel: 1}, + "sql.query|hits|env:default,resource:δ,service:C": expectedCount{value: 2, topLevel: 2}, + } + + assert.Len(sb.Counts, len(expectedCounts), 
"Missing counts!") + for ckey, c := range sb.Counts { + val, ok := expectedCounts[ckey] + if !ok { + assert.Fail("Unexpected count %s", ckey) + } + assert.Equal(val.value, c.Value, "Count %s wrong value", ckey) + assert.Equal(val.topLevel, c.TopLevel, "Count %s wrong topLevel", ckey) + } + + expectedDistributions := map[string]expectedDistribution{ + "A.foo|duration|env:default,resource:α,service:A": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 1, G: 1, Delta: 0}}, topLevel: 1}, + "A.foo|duration|env:default,resource:β,service:A": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 2, G: 1, Delta: 0}}, topLevel: 1}, + "B.foo|duration|env:default,resource:γ,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 3, G: 1, Delta: 0}}, topLevel: 1}, + "B.foo|duration|env:default,resource:ε,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 4, G: 1, Delta: 0}}, topLevel: 1}, + "B.foo|duration|env:default,resource:ζ,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 5, G: 1, Delta: 0}}, topLevel: 1}, + "sql.query|duration|env:default,resource:ζ,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 6, G: 1, Delta: 0}}, topLevel: 1}, + "sql.query|duration|env:default,resource:δ,service:C": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 7, G: 1, Delta: 0}, quantile.Entry{V: 8, G: 1, Delta: 0}}, topLevel: 2}, + } + + for k, v := range sb.Distributions { + t.Logf("%v: %v", k, v.Summary.Entries) + } + assert.Len(sb.Distributions, len(expectedDistributions), "Missing distributions!") + for dkey, d := range sb.Distributions { + val, ok := expectedDistributions[dkey] + if !ok { + assert.Fail("Unexpected distribution %s", dkey) + } + assert.Equal(val.entries, d.Summary.Entries, "Distribution %s wrong value", dkey) + assert.Equal(val.topLevel, d.TopLevel, "Distribution %s wrong topLevel", dkey) + } + + expectedErrDistributions := map[string]expectedDistribution{ + "A.foo|duration|env:default,resource:α,service:A": expectedDistribution{ + entries: nil, topLevel: 1}, + "A.foo|duration|env:default,resource:β,service:A": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 2, G: 1, Delta: 0}}, topLevel: 1}, + "B.foo|duration|env:default,resource:γ,service:B": expectedDistribution{ + entries: nil, topLevel: 1}, + "B.foo|duration|env:default,resource:ε,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 4, G: 1, Delta: 0}}, topLevel: 1}, + "B.foo|duration|env:default,resource:ζ,service:B": expectedDistribution{ + entries: nil, topLevel: 1}, + "sql.query|duration|env:default,resource:ζ,service:B": expectedDistribution{ + entries: nil, topLevel: 1}, + "sql.query|duration|env:default,resource:δ,service:C": expectedDistribution{ + entries: nil, topLevel: 2}, + } + + for k, v := range sb.ErrDistributions { + t.Logf("%v: %v", k, v.Summary.Entries) + } + assert.Len(sb.ErrDistributions, len(expectedErrDistributions), "Missing distributions!") + for dkey, d := range sb.ErrDistributions { + val, ok := expectedErrDistributions[dkey] + if !ok { + assert.Fail("Unexpected distribution %s", dkey) + } + assert.Equal(val.entries, d.Summary.Entries, "ErrDistribution %s wrong value", dkey) + assert.Equal(val.topLevel, d.TopLevel, "ErrDistribution %s wrong topLevel", dkey) + } +} + +func TestStatsBucketExtraAggregators(t *testing.T) { + assert := assert.New(t) + + srb := NewStatsRawBucket(0, 1e9) + + // one custom aggregator 
+ aggr := []string{"version"} + for _, s := range testWeightedSpans() { + srb.HandleSpan(s, defaultEnv, aggr, nil) + } + sb := srb.Export() + + expectedCounts := map[string]expectedCount{ + "A.foo|duration|env:default,resource:α,service:A": expectedCount{value: 1, topLevel: 1}, + "A.foo|duration|env:default,resource:β,service:A": expectedCount{value: 2, topLevel: 1}, + "B.foo|duration|env:default,resource:γ,service:B": expectedCount{value: 3, topLevel: 1}, + "B.foo|duration|env:default,resource:ε,service:B": expectedCount{value: 4, topLevel: 1}, + "sql.query|duration|env:default,resource:δ,service:C": expectedCount{value: 15, topLevel: 2}, + "A.foo|errors|env:default,resource:α,service:A": expectedCount{value: 0, topLevel: 1}, + "A.foo|errors|env:default,resource:β,service:A": expectedCount{value: 1, topLevel: 1}, + "B.foo|errors|env:default,resource:γ,service:B": expectedCount{value: 0, topLevel: 1}, + "B.foo|errors|env:default,resource:ε,service:B": expectedCount{value: 1, topLevel: 1}, + "sql.query|errors|env:default,resource:δ,service:C": expectedCount{value: 0, topLevel: 2}, + "A.foo|hits|env:default,resource:α,service:A": expectedCount{value: 1, topLevel: 1}, + "A.foo|hits|env:default,resource:β,service:A": expectedCount{value: 1, topLevel: 1}, + "B.foo|hits|env:default,resource:γ,service:B": expectedCount{value: 1, topLevel: 1}, + "B.foo|hits|env:default,resource:ε,service:B": expectedCount{value: 1, topLevel: 1}, + "sql.query|hits|env:default,resource:δ,service:C": expectedCount{value: 2, topLevel: 2}, + "sql.query|errors|env:default,resource:ζ,service:B,version:1.4": expectedCount{value: 0, topLevel: 1}, + "sql.query|hits|env:default,resource:ζ,service:B,version:1.4": expectedCount{value: 1, topLevel: 1}, + "sql.query|duration|env:default,resource:ζ,service:B,version:1.4": expectedCount{value: 6, topLevel: 1}, + "B.foo|errors|env:default,resource:ζ,service:B,version:1.3": expectedCount{value: 0, topLevel: 1}, + "B.foo|duration|env:default,resource:ζ,service:B,version:1.3": expectedCount{value: 5, topLevel: 1}, + "B.foo|hits|env:default,resource:ζ,service:B,version:1.3": expectedCount{value: 1, topLevel: 1}, + } + + assert.Len(sb.Counts, len(expectedCounts), "Missing counts!") + for ckey, c := range sb.Counts { + val, ok := expectedCounts[ckey] + if !ok { + assert.Fail("Unexpected count %s", ckey) + } + assert.Equal(val.value, c.Value, "Count %s wrong value", ckey) + assert.Equal(val.topLevel, c.TopLevel, "Count %s wrong topLevel", ckey) + keyFields := strings.Split(ckey, "|") + tags := NewTagSetFromString(keyFields[2]) + assert.Equal(tags, c.TagSet, "bad tagset for count %s", ckey) + } +} + +func TestStatsBucketMany(t *testing.T) { + if testing.Short() { + return + } + + assert := assert.New(t) + + templateSpan := &WeightedSpan{ + Span: &pb.Span{Service: "A", Name: "A.foo", Resource: "α", Duration: 7}, + Weight: 1, + TopLevel: true, + } + const n = 100000 + + srb := NewStatsRawBucket(0, 1e9) + + // No custom aggregators only the defaults + aggr := []string{} + for i := 0; i < n; i++ { + s := templateSpan + s.Resource = "α" + strconv.Itoa(i) + srbCopy := *srb + srbCopy.HandleSpan(s, defaultEnv, aggr, nil) + } + sb := srb.Export() + + assert.Len(sb.Counts, 3*n, "Missing counts %d != %d", len(sb.Counts), 3*n) + for ckey, c := range sb.Counts { + if strings.Contains(ckey, "|duration|") { + assert.Equal(7.0, c.Value, "duration %s wrong value", ckey) + } + if strings.Contains(ckey, "|errors|") { + assert.Equal(0.0, c.Value, "errors %s wrong value", ckey) + } + if strings.Contains(ckey, 
"|hits|") { + assert.Equal(1.0, c.Value, "hits %s wrong value", ckey) + } + } +} + +func TestStatsBucketSublayers(t *testing.T) { + assert := assert.New(t) + + tr := testTrace() + sublayers := ComputeSublayers(tr) + root := traceutil.GetRoot(tr) + SetSublayersOnSpan(root, sublayers) + + wt := NewWeightedTrace(tr, root) + + assert.NotNil(sublayers) + + srb := NewStatsRawBucket(0, 1e9) + + // No custom aggregators only the defaults + aggr := []string{} + for _, s := range wt { + srb.HandleSpan(s, defaultEnv, aggr, sublayers) + } + sb := srb.Export() + + expectedCounts := map[string]expectedCount{ + "A.foo|_sublayers.duration.by_service|env:default,resource:α,service:A,sublayer_service:A": expectedCount{value: 160, topLevel: 2}, + "A.foo|_sublayers.duration.by_service|env:default,resource:α,service:A,sublayer_service:B": expectedCount{value: 24, topLevel: 2}, + "A.foo|_sublayers.duration.by_service|env:default,resource:α,service:A,sublayer_service:C": expectedCount{value: 16, topLevel: 2}, + "A.foo|_sublayers.duration.by_type|env:default,resource:α,service:A,sublayer_type:sql": expectedCount{value: 16, topLevel: 2}, + "A.foo|_sublayers.duration.by_type|env:default,resource:α,service:A,sublayer_type:web": expectedCount{value: 184, topLevel: 2}, + "A.foo|_sublayers.span_count|env:default,resource:α,service:A,:": expectedCount{value: 8, topLevel: 2}, + "A.foo|duration|env:default,resource:α,service:A": expectedCount{value: 200, topLevel: 2}, + "A.foo|errors|env:default,resource:α,service:A": expectedCount{value: 0, topLevel: 2}, + "A.foo|hits|env:default,resource:α,service:A": expectedCount{value: 2, topLevel: 2}, + "B.bar|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:A": expectedCount{value: 160, topLevel: 2}, + "B.bar|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:B": expectedCount{value: 24, topLevel: 2}, + "B.bar|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:C": expectedCount{value: 16, topLevel: 2}, + "B.bar|_sublayers.duration.by_type|env:default,resource:α,service:B,sublayer_type:sql": expectedCount{value: 16, topLevel: 2}, + "B.bar|_sublayers.duration.by_type|env:default,resource:α,service:B,sublayer_type:web": expectedCount{value: 184, topLevel: 2}, + "B.bar|_sublayers.span_count|env:default,resource:α,service:B,:": expectedCount{value: 8, topLevel: 2}, + "B.bar|duration|env:default,resource:α,service:B": expectedCount{value: 40, topLevel: 2}, + "B.bar|errors|env:default,resource:α,service:B": expectedCount{value: 0, topLevel: 2}, + "B.bar|hits|env:default,resource:α,service:B": expectedCount{value: 2, topLevel: 2}, + "sql.query|_sublayers.duration.by_service|env:default,resource:SELECT ololololo... value FROM table,service:C,sublayer_service:A": expectedCount{value: 160, topLevel: 2}, + "sql.query|_sublayers.duration.by_service|env:default,resource:SELECT ololololo... value FROM table,service:C,sublayer_service:B": expectedCount{value: 24, topLevel: 2}, + "sql.query|_sublayers.duration.by_service|env:default,resource:SELECT ololololo... 
value FROM table,service:C,sublayer_service:C": expectedCount{value: 16, topLevel: 2}, + "sql.query|_sublayers.duration.by_service|env:default,resource:SELECT value FROM table,service:C,sublayer_service:A": expectedCount{value: 160, topLevel: 2}, + "sql.query|_sublayers.duration.by_service|env:default,resource:SELECT value FROM table,service:C,sublayer_service:B": expectedCount{value: 24, topLevel: 2}, + "sql.query|_sublayers.duration.by_service|env:default,resource:SELECT value FROM table,service:C,sublayer_service:C": expectedCount{value: 16, topLevel: 2}, + "sql.query|_sublayers.duration.by_type|env:default,resource:SELECT ololololo... value FROM table,service:C,sublayer_type:sql": expectedCount{value: 16, topLevel: 2}, + "sql.query|_sublayers.duration.by_type|env:default,resource:SELECT ololololo... value FROM table,service:C,sublayer_type:web": expectedCount{value: 184, topLevel: 2}, + "sql.query|_sublayers.duration.by_type|env:default,resource:SELECT value FROM table,service:C,sublayer_type:sql": expectedCount{value: 16, topLevel: 2}, + "sql.query|_sublayers.duration.by_type|env:default,resource:SELECT value FROM table,service:C,sublayer_type:web": expectedCount{value: 184, topLevel: 2}, + "sql.query|_sublayers.span_count|env:default,resource:SELECT ololololo... value FROM table,service:C,:": expectedCount{value: 8, topLevel: 2}, + "sql.query|_sublayers.span_count|env:default,resource:SELECT value FROM table,service:C,:": expectedCount{value: 8, topLevel: 2}, + "sql.query|duration|env:default,resource:SELECT ololololo... value FROM table,service:C": expectedCount{value: 6, topLevel: 2}, + "sql.query|duration|env:default,resource:SELECT value FROM table,service:C": expectedCount{value: 10, topLevel: 2}, + "sql.query|errors|env:default,resource:SELECT ololololo... value FROM table,service:C": expectedCount{value: 2, topLevel: 2}, + "sql.query|errors|env:default,resource:SELECT value FROM table,service:C": expectedCount{value: 0, topLevel: 2}, + "sql.query|hits|env:default,resource:SELECT ololololo... value FROM table,service:C": expectedCount{value: 2, topLevel: 2}, + "sql.query|hits|env:default,resource:SELECT value FROM table,service:C": expectedCount{value: 2, topLevel: 2}, + } + + assert.Len(sb.Counts, len(expectedCounts), "Missing counts!") + for ckey, c := range sb.Counts { + val, ok := expectedCounts[ckey] + if !ok { + assert.Fail("Unexpected count %s", ckey) + } + assert.Equal(val.value, c.Value, "Count %s wrong value", ckey) + assert.Equal(val.topLevel, c.TopLevel, "Count %s wrong topLevel", ckey) + keyFields := strings.Split(ckey, "|") + tags := NewTagSetFromString(keyFields[2]) + assert.Equal(tags, c.TagSet, "bad tagset for count %s", ckey) + } + + expectedDistributions := map[string]expectedDistribution{ + "A.foo|duration|env:default,resource:α,service:A": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 100, G: 1, Delta: 0}}, topLevel: 2}, + "B.bar|duration|env:default,resource:α,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 20, G: 1, Delta: 0}}, topLevel: 2}, + "sql.query|duration|env:default,resource:SELECT value FROM table,service:C": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 5, G: 1, Delta: 0}}, topLevel: 2}, + "sql.query|duration|env:default,resource:SELECT ololololo... 
value FROM table,service:C": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 3, G: 1, Delta: 0}}, topLevel: 2}, + } + + assert.Len(sb.Distributions, len(expectedDistributions), "Missing distributions!") + for dkey, d := range sb.Distributions { + val, ok := expectedDistributions[dkey] + if !ok { + assert.Fail("Unexpected distribution %s", dkey) + } + assert.Equal(val.entries, d.Summary.Entries, "Distribution %s wrong value", dkey) + assert.Equal(val.topLevel, d.TopLevel, "Distribution %s wrong topLevel", dkey) + keyFields := strings.Split(dkey, "|") + tags := NewTagSetFromString(keyFields[2]) + assert.Equal(tags, d.TagSet, "bad tagset for distribution %s", dkey) + } +} + +func TestStatsBucketSublayersTopLevel(t *testing.T) { + assert := assert.New(t) + + tr := testTraceTopLevel() + sublayers := ComputeSublayers(tr) + root := traceutil.GetRoot(tr) + SetSublayersOnSpan(root, sublayers) + + wt := NewWeightedTrace(tr, root) + + assert.NotNil(sublayers) + + srb := NewStatsRawBucket(0, 1e9) + + // No custom aggregators only the defaults + aggr := []string{} + for _, s := range wt { + srb.HandleSpan(s, defaultEnv, aggr, sublayers) + } + sb := srb.Export() + + expectedCounts := map[string]expectedCount{ + "A.foo|_sublayers.duration.by_service|env:default,resource:α,service:A,sublayer_service:A": expectedCount{value: 80, topLevel: 1}, + "A.foo|_sublayers.duration.by_service|env:default,resource:α,service:A,sublayer_service:B": expectedCount{value: 20, topLevel: 1}, + "A.foo|_sublayers.duration.by_type|env:default,resource:α,service:A,sublayer_type:web": expectedCount{value: 100, topLevel: 1}, + "A.foo|_sublayers.span_count|env:default,resource:α,service:A,:": expectedCount{value: 4, topLevel: 1}, + "A.foo|hits|env:default,resource:α,service:A": expectedCount{value: 1, topLevel: 1}, + "A.foo|errors|env:default,resource:α,service:A": expectedCount{value: 0, topLevel: 1}, + "A.foo|duration|env:default,resource:α,service:A": expectedCount{value: 100, topLevel: 1}, + "B.bar|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:A": expectedCount{value: 80, topLevel: 1}, + "B.bar|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:B": expectedCount{value: 20, topLevel: 1}, + "B.bar|_sublayers.duration.by_type|env:default,resource:α,service:B,sublayer_type:web": expectedCount{value: 100, topLevel: 1}, + "B.bar|_sublayers.span_count|env:default,resource:α,service:B,:": expectedCount{value: 4, topLevel: 1}, + "B.bar|hits|env:default,resource:α,service:B": expectedCount{value: 1, topLevel: 1}, + "B.bar|errors|env:default,resource:α,service:B": expectedCount{value: 0, topLevel: 1}, + "B.bar|duration|env:default,resource:α,service:B": expectedCount{value: 20, topLevel: 1}, + // [TODO] the ultimate target is to *NOT* compute & store the counts below, which have topLevel == 0 + "B.bar.1|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:A": expectedCount{value: 80, topLevel: 0}, + "B.bar.1|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:B": expectedCount{value: 20, topLevel: 0}, + "B.bar.1|_sublayers.duration.by_type|env:default,resource:α,service:B,sublayer_type:web": expectedCount{value: 100, topLevel: 0}, + "B.bar.1|_sublayers.span_count|env:default,resource:α,service:B,:": expectedCount{value: 4, topLevel: 0}, + "B.bar.1|hits|env:default,resource:α,service:B": expectedCount{value: 1, topLevel: 0}, + "B.bar.1|errors|env:default,resource:α,service:B": expectedCount{value: 0, 
topLevel: 0}, + "B.bar.1|duration|env:default,resource:α,service:B": expectedCount{value: 5, topLevel: 0}, + "B.bar.2|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:A": expectedCount{value: 80, topLevel: 0}, + "B.bar.2|_sublayers.duration.by_service|env:default,resource:α,service:B,sublayer_service:B": expectedCount{value: 20, topLevel: 0}, + "B.bar.2|_sublayers.duration.by_type|env:default,resource:α,service:B,sublayer_type:web": expectedCount{value: 100, topLevel: 0}, + "B.bar.2|_sublayers.span_count|env:default,resource:α,service:B,:": expectedCount{value: 4, topLevel: 0}, + "B.bar.2|hits|env:default,resource:α,service:B": expectedCount{value: 1, topLevel: 0}, + "B.bar.2|errors|env:default,resource:α,service:B": expectedCount{value: 1, topLevel: 0}, + "B.bar.2|duration|env:default,resource:α,service:B": expectedCount{value: 3, topLevel: 0}, + } + + assert.Len(sb.Counts, len(expectedCounts), "Missing counts!") + for ckey, c := range sb.Counts { + val, ok := expectedCounts[ckey] + if !ok { + assert.Fail("Unexpected count %s", ckey) + } + assert.Equal(val.value, c.Value, "Count %s wrong value", ckey) + assert.Equal(val.topLevel, c.TopLevel, "Count %s wrong topLevel", ckey) + keyFields := strings.Split(ckey, "|") + tags := NewTagSetFromString(keyFields[2]) + assert.Equal(tags, c.TagSet, "bad tagset for count %s", ckey) + } + + expectedDistributions := map[string]expectedDistribution{ + "A.foo|duration|env:default,resource:α,service:A": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 100, G: 1, Delta: 0}}, topLevel: 1}, + "B.bar|duration|env:default,resource:α,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 20, G: 1, Delta: 0}}, topLevel: 1}, + // [TODO] the ultimate target is to *NOT* compute & store the counts below, which have topLevel == 0 + "B.bar.1|duration|env:default,resource:α,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 5, G: 1, Delta: 0}}, topLevel: 0}, + "B.bar.2|duration|env:default,resource:α,service:B": expectedDistribution{ + entries: []quantile.Entry{quantile.Entry{V: 3, G: 1, Delta: 0}}, topLevel: 0}, + } + + assert.Len(sb.Distributions, len(expectedDistributions), "Missing distributions!") + for dkey, d := range sb.Distributions { + val, ok := expectedDistributions[dkey] + if !ok { + assert.Fail("Unexpected distribution %s", dkey) + } + assert.Equal(val.entries, d.Summary.Entries, "Distribution %s wrong value", dkey) + assert.Equal(val.topLevel, d.TopLevel, "Distribution %s wrong topLevel", dkey) + keyFields := strings.Split(dkey, "|") + tags := NewTagSetFromString(keyFields[2]) + assert.Equal(tags, d.TagSet, "bad tagset for distribution %s", dkey) + } +} + +func TestTsRounding(t *testing.T) { + assert := assert.New(t) + + durations := []int64{ + 3 * 1e9, // 10110010110100000101111000000000 -> 10110010110000000000000000000000 = 2998927360 + 32432874923, // 11110001101001001100110010110101011 -> 11110001100000000000000000000000000 = 32413581312 + 1000, // Keep it with full precision + 45, // Keep it with full precision + 41000234, // 10011100011001110100101010 -> 10011100010000000000000000 = 40960000 + } + + type testcase struct { + res time.Duration + exp []float64 + } + + exp := []float64{2998927360, 32413581312, 1000, 45, 40960000} + + results := []float64{} + for _, d := range durations { + results = append(results, nsTimestampToFloat(d)) + } + assert.Equal(exp, results, "Unproper rounding of timestamp") +} + +func BenchmarkHandleSpan(b *testing.B) { 
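	// Feeds the fixed 8-span fixture from testWeightedSpans into one raw bucket on
	// every iteration; the fixture is rebuilt each time, so its allocation is part
	// of what is measured.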
+ + srb := NewStatsRawBucket(0, 1e9) + aggr := []string{} + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + for _, s := range testWeightedSpans() { + srb.HandleSpan(s, defaultEnv, aggr, nil) + } + } +} + +func BenchmarkHandleSpanSublayers(b *testing.B) { + + srb := NewStatsRawBucket(0, 1e9) + aggr := []string{} + + tr := testTrace() + sublayers := ComputeSublayers(tr) + root := traceutil.GetRoot(tr) + SetSublayersOnSpan(root, sublayers) + + wt := NewWeightedTrace(tr, root) + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + for _, s := range wt { + srb.HandleSpan(s, defaultEnv, aggr, sublayers) + } + } +} + +// it's important to have these defined as var and not const/inline +// else compiler performs compile-time optimization when using + with strings +var grainName = "mysql.query" +var grainMeasure = "duration" +var grainAggr = "resource:SELECT * FROM stuff,service:mysql" + +// testing out various way of doing string ops, to check which one is most efficient +func BenchmarkGrainKey(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = GrainKey(grainName, grainMeasure, grainAggr) + } +} + +func BenchmarkStringPlus(b *testing.B) { + if testing.Short() { + return + } + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = grainName + "|" + grainMeasure + "|" + grainAggr + } +} + +func BenchmarkSprintf(b *testing.B) { + if testing.Short() { + return + } + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = fmt.Sprintf("%s|%s|%s", grainName, grainMeasure, grainAggr) + } +} + +func BenchmarkBufferWriteByte(b *testing.B) { + if testing.Short() { + return + } + var buf bytes.Buffer + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + buf.Reset() + buf.WriteString(grainName) + buf.WriteByte('|') + buf.WriteString(grainMeasure) + buf.WriteByte('|') + buf.WriteString(grainAggr) + _ = buf.String() + } +} + +func BenchmarkBufferWriteRune(b *testing.B) { + if testing.Short() { + return + } + var buf bytes.Buffer + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + buf.Reset() + buf.WriteString(grainName) + buf.WriteRune('|') + buf.WriteString(grainMeasure) + buf.WriteRune('|') + buf.WriteString(grainAggr) + _ = buf.String() + } +} + +func BenchmarkStringsJoin(b *testing.B) { + if testing.Short() { + return + } + a := []string{grainName, grainMeasure, grainAggr} + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = strings.Join(a, "|") + } +} diff --git a/pkg/trace/agent/statsraw.go b/pkg/trace/agent/statsraw.go new file mode 100644 index 0000000000000..d710b17e90e54 --- /dev/null +++ b/pkg/trace/agent/statsraw.go @@ -0,0 +1,280 @@ +package agent + +import ( + "bytes" + "sort" + + "github.com/DataDog/datadog-agent/pkg/trace/quantile" +) + +// Most "algorithm" stuff here is tested with stats_test.go as what is important +// is that the final data, the one with send after a call to Export(), is correct. 
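For orientation, a minimal sketch of the intended call sequence, using only APIs defined in this patch (the wrapper function name is illustrative and not part of the original code):

func exampleBucketLifecycle(spans WeightedTrace, sublayers []SublayerValue) StatsBucket {
	// open a raw bucket covering [0, 1s), expressed in nanoseconds
	srb := NewStatsRawBucket(0, 1e9)
	for _, s := range spans {
		// aggregate each weighted span under the default env/resource/service grain,
		// with no extra aggregators; sublayers may be nil
		srb.HandleSpan(s, "default", nil, sublayers)
	}
	// Export converts the private raw form into the shared StatsBucket type
	return srb.Export()
}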
+ +type groupedStats struct { + tags TagSet + + topLevel float64 + + hits float64 + errors float64 + duration float64 + durationDistribution *quantile.SliceSummary + errDurationDistribution *quantile.SliceSummary +} + +type sublayerStats struct { + tags TagSet + + topLevel float64 + + value int64 +} + +func newGroupedStats(tags TagSet) groupedStats { + return groupedStats{ + tags: tags, + durationDistribution: quantile.NewSliceSummary(), + errDurationDistribution: quantile.NewSliceSummary(), + } +} + +func newSublayerStats(tags TagSet) sublayerStats { + return sublayerStats{ + tags: tags, + } +} + +type statsKey struct { + name string + aggr string +} + +type statsSubKey struct { + name string + measure string + aggr string +} + +// StatsRawBucket is used to compute span data and aggregate it +// within a time-framed bucket. This should not be used outside +// the agent, use StatsBucket for this. +type StatsRawBucket struct { + // This should really have no public fields. At all. + + start int64 // timestamp of start in our format + duration int64 // duration of a bucket in nanoseconds + + // this should really remain private as it's subject to refactoring + data map[statsKey]groupedStats + sublayerData map[statsSubKey]sublayerStats + + // internal buffer for aggregate strings - not threadsafe + keyBuf bytes.Buffer +} + +// NewStatsRawBucket opens a new calculation bucket for time ts and initializes it properly +func NewStatsRawBucket(ts, d int64) *StatsRawBucket { + // The only non-initialized value is the Duration which should be set by whoever closes that bucket + return &StatsRawBucket{ + start: ts, + duration: d, + data: make(map[statsKey]groupedStats), + sublayerData: make(map[statsSubKey]sublayerStats), + } +} + +// Export transforms a StatsRawBucket into a StatsBucket, typically used +// before communicating data to the API, as StatsRawBucket is the internal +// type while StatsBucket is the public, shared one. 
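// Keys of the exported maps follow the GrainKey layout "name|measure|aggregation",
// e.g. "A.foo|hits|env:default,resource:α,service:A" (see TestGrainKey and
// TestStatsBucketDefault in stats_test.go).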
+func (sb *StatsRawBucket) Export() StatsBucket { + ret := NewStatsBucket(sb.start, sb.duration) + for k, v := range sb.data { + hitsKey := GrainKey(k.name, HITS, k.aggr) + ret.Counts[hitsKey] = Count{ + Key: hitsKey, + Name: k.name, + Measure: HITS, + TagSet: v.tags, + TopLevel: v.topLevel, + Value: float64(v.hits), + } + errorsKey := GrainKey(k.name, ERRORS, k.aggr) + ret.Counts[errorsKey] = Count{ + Key: errorsKey, + Name: k.name, + Measure: ERRORS, + TagSet: v.tags, + TopLevel: v.topLevel, + Value: float64(v.errors), + } + durationKey := GrainKey(k.name, DURATION, k.aggr) + ret.Counts[durationKey] = Count{ + Key: durationKey, + Name: k.name, + Measure: DURATION, + TagSet: v.tags, + TopLevel: v.topLevel, + Value: float64(v.duration), + } + ret.Distributions[durationKey] = Distribution{ + Key: durationKey, + Name: k.name, + Measure: DURATION, + TagSet: v.tags, + TopLevel: v.topLevel, + Summary: v.durationDistribution, + } + ret.ErrDistributions[durationKey] = Distribution{ + Key: durationKey, + Name: k.name, + Measure: DURATION, + TagSet: v.tags, + TopLevel: v.topLevel, + Summary: v.errDurationDistribution, + } + } + for k, v := range sb.sublayerData { + key := GrainKey(k.name, k.measure, k.aggr) + ret.Counts[key] = Count{ + Key: key, + Name: k.name, + Measure: k.measure, + TagSet: v.tags, + TopLevel: v.topLevel, + Value: float64(v.value), + } + } + return ret +} + +func assembleGrain(b *bytes.Buffer, env, resource, service string, m map[string]string) (string, TagSet) { + b.Reset() + + b.WriteString("env:") + b.WriteString(env) + b.WriteString(",resource:") + b.WriteString(resource) + b.WriteString(",service:") + b.WriteString(service) + + tagset := TagSet{{"env", env}, {"resource", resource}, {"service", service}} + + if m == nil || len(m) == 0 { + return b.String(), tagset + } + + keys := make([]string, len(m)) + j := 0 + for k := range m { + keys[j] = k + j++ + } + + sort.Strings(keys) // required else aggregations would not work + + for _, key := range keys { + b.WriteRune(',') + b.WriteString(key) + b.WriteRune(':') + b.WriteString(m[key]) + tagset = append(tagset, Tag{key, m[key]}) + } + + return b.String(), tagset +} + +// HandleSpan adds the span to this bucket stats, aggregated with the finest grain matching given aggregators +func (sb *StatsRawBucket) HandleSpan(s *WeightedSpan, env string, aggregators []string, sublayers []SublayerValue) { + if env == "" { + panic("env should never be empty") + } + + m := make(map[string]string) + + for _, agg := range aggregators { + if agg != "env" && agg != "resource" && agg != "service" { + if v, ok := s.Meta[agg]; ok { + m[agg] = v + } + } + } + + grain, tags := assembleGrain(&sb.keyBuf, env, s.Resource, s.Service, m) + sb.add(s, grain, tags) + + for _, sub := range sublayers { + sb.addSublayer(s, grain, tags, sub) + } +} + +func (sb *StatsRawBucket) add(s *WeightedSpan, aggr string, tags TagSet) { + var gs groupedStats + var ok bool + + key := statsKey{name: s.Name, aggr: aggr} + if gs, ok = sb.data[key]; !ok { + gs = newGroupedStats(tags) + } + + if s.TopLevel { + gs.topLevel += s.Weight + } + + gs.hits += s.Weight + if s.Error != 0 { + gs.errors += s.Weight + } + gs.duration += float64(s.Duration) * s.Weight + + // TODO add for s.Metrics ability to define arbitrary counts and distros, check some config? 
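	// The resolution drop just below keeps only the ~10 most significant bits of
	// the duration (see roundMask and nsTimestampToFloat at the end of this file),
	// so each inserted value stays within about 1/1024 of the true duration: per
	// TestTsRounding, a 41000234ns duration is stored as 40960000, while small
	// values such as 1000 or 45 keep full precision.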
+ // alter resolution of duration distro + trundur := nsTimestampToFloat(s.Duration) + gs.durationDistribution.Insert(trundur, s.SpanID) + + if s.Error != 0 { + gs.errDurationDistribution.Insert(trundur, s.SpanID) + } + + sb.data[key] = gs +} + +func (sb *StatsRawBucket) addSublayer(s *WeightedSpan, aggr string, tags TagSet, sub SublayerValue) { + // This is not as efficient as a "regular" add as we don't update + // all sublayers at once (one call for HITS, and another one for ERRORS, DURATION...) + // when logically, if we have a sublayer for HITS, we also have one for DURATION, + // they should indeed come together. Still room for improvement here. + + var ss sublayerStats + var ok bool + + subAggr := aggr + "," + sub.Tag.Name + ":" + sub.Tag.Value + subTags := make(TagSet, len(tags)+1) + copy(subTags, tags) + subTags[len(tags)] = sub.Tag + + key := statsSubKey{name: s.Name, measure: sub.Metric, aggr: subAggr} + if ss, ok = sb.sublayerData[key]; !ok { + ss = newSublayerStats(subTags) + } + + if s.TopLevel { + ss.topLevel += s.Weight + } + + ss.value += int64(s.Weight * sub.Value) + + sb.sublayerData[key] = ss +} + +// 10 bits precision (any value will be +/- 1/1024) +const roundMask int64 = 1 << 10 + +// nsTimestampToFloat converts a nanosec timestamp into a float nanosecond timestamp truncated to a fixed precision +func nsTimestampToFloat(ns int64) float64 { + var shift uint + for ns > roundMask { + ns = ns >> 1 + shift++ + } + return float64(ns << shift) +} diff --git a/pkg/trace/agent/statsraw_test.go b/pkg/trace/agent/statsraw_test.go new file mode 100644 index 0000000000000..62737b40edcc9 --- /dev/null +++ b/pkg/trace/agent/statsraw_test.go @@ -0,0 +1,30 @@ +package agent + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func TestGrain(t *testing.T) { + srb := NewStatsRawBucket(0, 1e9) + assert := assert.New(t) + + s := pb.Span{Service: "thing", Name: "other", Resource: "yo"} + aggr, tgs := assembleGrain(&srb.keyBuf, "default", s.Resource, s.Service, nil) + + assert.Equal("env:default,resource:yo,service:thing", aggr) + assert.Equal(TagSet{Tag{"env", "default"}, Tag{"resource", "yo"}, Tag{"service", "thing"}}, tgs) +} + +func TestGrainWithExtraTags(t *testing.T) { + srb := NewStatsRawBucket(0, 1e9) + assert := assert.New(t) + + s := pb.Span{Service: "thing", Name: "other", Resource: "yo", Meta: map[string]string{"meta2": "two", "meta1": "ONE"}} + aggr, tgs := assembleGrain(&srb.keyBuf, "default", s.Resource, s.Service, s.Meta) + + assert.Equal("env:default,resource:yo,service:thing,meta1:ONE,meta2:two", aggr) + assert.Equal(TagSet{Tag{"env", "default"}, Tag{"resource", "yo"}, Tag{"service", "thing"}, Tag{"meta1", "ONE"}, Tag{"meta2", "two"}}, tgs) +} diff --git a/pkg/trace/agent/sublayers.go b/pkg/trace/agent/sublayers.go new file mode 100644 index 0000000000000..f15b5065e2e8d --- /dev/null +++ b/pkg/trace/agent/sublayers.go @@ -0,0 +1,268 @@ +package agent + +import ( + "fmt" + "sort" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" +) + +// SublayerValue is just a span-metric placeholder for a given sublayer val +type SublayerValue struct { + Metric string + Tag Tag + Value float64 +} + +// String returns a description of a sublayer value. 
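// For example, a tagged value prints as
// SublayerValue{"_sublayers.duration.by_service", sublayer_service:pgsql, 30}
// and the untagged span count prints as SublayerValue{"_sublayers.span_count", 2}.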
+func (v SublayerValue) String() string { + if v.Tag.Name == "" && v.Tag.Value == "" { + return fmt.Sprintf("SublayerValue{%q, %v}", v.Metric, v.Value) + } + + return fmt.Sprintf("SublayerValue{%q, %v, %v}", v.Metric, v.Tag, v.Value) +} + +// GoString returns a description of a sublayer value. +func (v SublayerValue) GoString() string { + return v.String() +} + +// ComputeSublayers extracts sublayer values by type and service for a trace +// +// Description of the algorithm, with the following trace as an example: +// +// 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 +// |===|===|===|===|===|===|===|===|===|===|===|===|===|===|===| +// <-1-------------------------------------------------> +// <-2-----------------> <-3---------> +// <-4---------> +// <-5-------------------> +// <--6--------------------> +// <-7-------------> +// 1: service=web-server, type=web, parent=nil +// 2: service=pg, type=db, parent=1 +// 3: service=render, type=web, parent=1 +// 4: service=pg-read, type=db, parent=2 +// 5: service=redis, type=cache, parent=1 +// 6: service=rpc1, type=rpc, parent=1 +// 7: service=alert, type=rpc, parent=6 +// +// Step 1: Find all time intervals to consider (set of start/end time +// of spans): +// +// [0, 10, 15, 20, 50, 60, 70, 80, 110, 120, 130, 150] +// +// Step 2: Map each time intervals to a set of "active" spans. A span +// is considered active for a given time interval if it has no +// direct child span at that time interval. This is done by +// iterating over the spans, iterating over each time +// intervals, and checking if the span has a child running +// during that time interval. If not, it is considered active: +// +// { +// 0: [ 1 ], +// 10: [ 2 ], +// 15: [ 2, 5 ], +// 20: [ 4, 5 ], +// ... +// 110: [ 7 ], +// 120: [ 1, 7 ], +// 130: [ 7 ], +// 150: [], +// } +// +// Step 4: Build a service and type duration mapping by: +// 1. iterating over each time intervals +// 2. computing the time interval duration portion (time +// interval duration / number of active spans) +// 3. iterate over each active span of that time interval +// 4. 
add to the active span's type and service duration the +// duration portion +// +// { +// web-server: 10, +// render: 15, +// pg: 12.5, +// pg-read: 15, +// redis: 27.5, +// rpc1: 30, +// alert: 40, +// } +// { +// web: 70, +// cache: 55, +// db: 55, +// rpc: 55, +// } +func ComputeSublayers(trace pb.Trace) []SublayerValue { + timestamps := buildTraceTimestamps(trace) + activeSpans := buildTraceActiveSpansMapping(trace, timestamps) + + durationsByService := computeDurationByAttr( + timestamps, activeSpans, func(s *pb.Span) string { return s.Service }, + ) + durationsByType := computeDurationByAttr( + timestamps, activeSpans, func(s *pb.Span) string { return s.Type }, + ) + + // Generate sublayers values + values := make([]SublayerValue, 0, + len(durationsByService)+len(durationsByType)+1, + ) + + for service, duration := range durationsByService { + values = append(values, SublayerValue{ + Metric: "_sublayers.duration.by_service", + Tag: Tag{"sublayer_service", service}, + Value: float64(int64(duration)), + }) + } + + for spanType, duration := range durationsByType { + values = append(values, SublayerValue{ + Metric: "_sublayers.duration.by_type", + Tag: Tag{"sublayer_type", spanType}, + Value: float64(int64(duration)), + }) + } + + values = append(values, SublayerValue{ + Metric: "_sublayers.span_count", + Value: float64(len(trace)), + }) + + return values +} + +// int64Slice is used by buildTraceTimestamps as a sortable slice of +// int64 +type int64Slice []int64 + +func (a int64Slice) Len() int { return len(a) } +func (a int64Slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a int64Slice) Less(i, j int) bool { return a[i] < a[j] } + +// buildTraceTimestamps returns the timestamps of a trace, i.e the set +// of start/end times of each spans +func buildTraceTimestamps(trace pb.Trace) []int64 { + tsSet := make(map[int64]struct{}, 2*len(trace)) + + for _, span := range trace { + start, end := span.Start, span.Start+span.Duration + tsSet[start] = struct{}{} + tsSet[end] = struct{}{} + } + + timestamps := make(int64Slice, 0, len(tsSet)) + for ts := range tsSet { + timestamps = append(timestamps, ts) + } + + sort.Sort(timestamps) + return timestamps +} + +// activeSpansMap is used by buildTraceActiveSpansMapping and is just +// a map with a add function setting the key to the empty slice of no +// entry exists +type activeSpansMap map[int64][]*pb.Span + +func (a activeSpansMap) Add(ts int64, span *pb.Span) { + if _, ok := a[ts]; !ok { + a[ts] = make([]*pb.Span, 0, 1) + } + a[ts] = append(a[ts], span) +} + +// buildTraceActiveSpansMapping returns a mapping from timestamps to +// a set of active spans +func buildTraceActiveSpansMapping(trace pb.Trace, timestamps []int64) map[int64][]*pb.Span { + activeSpans := make(activeSpansMap, len(timestamps)) + + tsToIdx := make(map[int64]int, len(timestamps)) + for i, ts := range timestamps { + tsToIdx[ts] = i + } + + spanChildren := traceutil.ChildrenMap(trace) + for sIdx, span := range trace { + start, end := span.Start, span.Start+span.Duration + for tsIdx := tsToIdx[start]; tsIdx < tsToIdx[end]; tsIdx++ { + ts := timestamps[tsIdx] + + // Do we have one of our child also in the + // current time interval? 
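			// A span is "active" at ts only if none of its direct children overlap ts.
			// In the mapping example above, span 2 is active at ts=15 because its
			// child 4 only starts at 20, but at ts=20 child 4 covers the interval
			// and takes over.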
+ hasChild := false + for _, child := range spanChildren[span.SpanID] { + start, end := child.Start, child.Start+child.Duration + if start <= ts && end > ts { + hasChild = true + break + } + } + + if !hasChild { + activeSpans.Add(ts, trace[sIdx]) + } + } + } + + return activeSpans +} + +// attrSelector is used by computeDurationByAttr and is a func +// returning an attribute for a given span +type attrSelector func(*pb.Span) string + +// computeDurationByAttr returns a mapping from an attribute to the +// sum of all weighted duration of spans with that given +// attribute. The attribute is returned by calling selector on each +// spans +func computeDurationByAttr(timestamps []int64, activeSpansByTs activeSpansMap, selector attrSelector) map[string]float64 { + durations := make(map[string]float64) + + for i := 0; i < len(timestamps)-1; i++ { + start := timestamps[i] + end := timestamps[i+1] + + activeSpans := activeSpansByTs[start] + if len(activeSpans) == 0 { + continue + } + + durationPortion := float64(end-start) / float64(len(activeSpans)) + + for _, span := range activeSpans { + key := selector(span) + if key == "" { + continue + } + + if _, ok := durations[key]; !ok { + durations[key] = 0 + } + durations[key] += durationPortion + } + } + + return durations +} + +// SetSublayersOnSpan takes some sublayers and pins them on the given span.Metrics +func SetSublayersOnSpan(span *pb.Span, values []SublayerValue) { + if span.Metrics == nil { + span.Metrics = make(map[string]float64, len(values)) + } + + for _, value := range values { + name := value.Metric + + if value.Tag.Name != "" { + name = name + "." + value.Tag.Name + ":" + value.Tag.Value + } + + span.Metrics[name] = value.Value + } +} diff --git a/pkg/trace/agent/sublayers_test.go b/pkg/trace/agent/sublayers_test.go new file mode 100644 index 0000000000000..44f06f95cb308 --- /dev/null +++ b/pkg/trace/agent/sublayers_test.go @@ -0,0 +1,445 @@ +package agent + +import ( + "sort" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +type sublayerValues []SublayerValue + +func (values sublayerValues) Len() int { + return len(values) +} + +func (values sublayerValues) Swap(i, j int) { + values[i], values[j] = values[j], values[i] +} + +func (values sublayerValues) Less(i, j int) bool { + if values[i].Metric < values[j].Metric { + return true + } else if values[i].Metric > values[j].Metric { + return false + } else { + return values[i].Tag.Value < values[j].Tag.Value + } +} + +func TestComputeSublayers(t *testing.T) { + assert := assert.New(t) + + span := func(id, parentId uint64, service, spanType string, start, duration int64) *pb.Span { + return &pb.Span{ + TraceID: 1, + SpanID: id, + ParentID: parentId, + Service: service, + Type: spanType, + Start: start, + Duration: duration, + } + } + + sublayerValueService := func(service string, value float64) SublayerValue { + return SublayerValue{ + Metric: "_sublayers.duration.by_service", + Tag: Tag{"sublayer_service", service}, + Value: value, + } + } + + sublayerValueType := func(spanType string, value float64) SublayerValue { + return SublayerValue{ + Metric: "_sublayers.duration.by_type", + Tag: Tag{"sublayer_type", spanType}, + Value: value, + } + } + + sublayerValueCount := func(count float64) SublayerValue { + return SublayerValue{ + Metric: "_sublayers.span_count", + Value: count, + } + } + + tests := []struct { + name string + trace pb.Trace + values []SublayerValue + }{ + // Single span + // + // 0 10 20 30 40 50 60 70 80 90 100 
+ // |===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------> + { + "single span", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 100), + }, + []SublayerValue{ + sublayerValueService("web-server", 100), + sublayerValueType("web", 100), + sublayerValueCount(1), + }, + }, + + // Multiple spans + // + // 0 10 20 30 40 50 60 70 80 90 100 + // |===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------> + // <-2-----> <-4-----> + // <-3-> + { + "multiple spans", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 100), + span(2, 1, "db-server", "db", 10, 20), + span(3, 2, "pgsql", "db", 15, 10), + span(4, 1, "web-server", "template", 40, 20), + }, + []SublayerValue{ + sublayerValueService("db-server", 10), + sublayerValueService("pgsql", 10), + sublayerValueService("web-server", 80), + sublayerValueType("db", 20), + sublayerValueType("template", 20), + sublayerValueType("web", 60), + sublayerValueCount(4), + }, + }, + + // Multiple parallel spans with no multiple service + // active + // + // 0 10 20 30 40 50 60 70 80 90 100 + // |===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------> + // <-2-----> <-5-----> + // <-3-----> <--6----> + // <-4-----> <-7-> + { + "multiple parallel spans no multiple service active", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 100), + span(2, 1, "rpc1", "rpc", 10, 20), + span(3, 1, "rpc1", "rpc", 15, 20), + span(4, 1, "rpc1", "rpc", 20, 20), + span(5, 1, "rpc2", "rpc", 50, 20), + span(6, 1, "rpc2", "rpc", 45, 20), + span(7, 1, "rpc3", "rpc", 80, 10), + }, + []SublayerValue{ + sublayerValueService("rpc1", 30), + sublayerValueService("rpc2", 25), + sublayerValueService("rpc3", 10), + sublayerValueService("web-server", 35), + sublayerValueType("rpc", 65), + sublayerValueType("web", 35), + sublayerValueCount(7), + }, + }, + + // Parallel spans parent not waiting + // + // 0 10 20 30 40 50 60 70 80 90 100 + // |===|===|===|===|===|===|===|===|===|===| + // <-1-----------------> + // <-2----------------> + // <-3-------------> + { + "parallel spans parent not waiting", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 50), + span(2, 1, "rpc1", "rpc", 20, 50), + span(3, 2, "rpc2", "rpc", 60, 40), + }, + []SublayerValue{ + sublayerValueService("rpc1", 40), + sublayerValueService("rpc2", 40), + sublayerValueService("web-server", 20), + sublayerValueType("rpc", 80), + sublayerValueType("web", 20), + sublayerValueCount(3), + }, + }, + + // Multiple parallel spans multiple service active parent not waiting + // + // 0 10 20 30 40 50 60 70 80 90 100 + // |===|===|===|===|===|===|===|===|===|===| + // <-1-----------------> + // <-2-----------------> + // <-3--------------------------------> + // <-4-> + { + "multiple parallel spans multiple service active parent not waiting", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 50), + span(2, 1, "rpc1", "rpc", 20, 50), + span(3, 1, "rpc2", "rpc", 10, 90), + span(4, 1, "rpc3", "rpc", 60, 10), + }, + []SublayerValue{ + sublayerValueService("rpc1", 23), + sublayerValueService("rpc2", 63), + sublayerValueService("rpc3", 3), + sublayerValueService("web-server", 10), + sublayerValueType("rpc", 90), + sublayerValueType("web", 10), + sublayerValueCount(4), + }, + }, + + // + // Mix of everything + // + // 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 + // |===|===|===|===|===|===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------------------> + // <-2-----------------> <-3---------> + // <-4---------> + // 
<-5-------------------> + // <--6--------------------> + // <-7-------------> + { + "mix of everything", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 130), + span(2, 1, "pg", "db", 10, 50), + span(3, 1, "render", "web", 80, 30), + span(4, 2, "pg-read", "db", 20, 30), + span(5, 1, "redis", "cache", 15, 55), + span(6, 1, "rpc1", "rpc", 60, 60), + span(7, 6, "alert", "rpc", 110, 40), + }, + []SublayerValue{ + sublayerValueService("alert", 35), + sublayerValueService("pg", 12), + sublayerValueService("pg-read", 15), + sublayerValueService("redis", 27), + sublayerValueService("render", 15), + sublayerValueService("rpc1", 30), + sublayerValueService("web-server", 15), + sublayerValueType("cache", 27), + sublayerValueType("db", 27), + sublayerValueType("rpc", 65), + sublayerValueType("web", 30), + sublayerValueCount(7), + }, + }, + } + + for _, test := range tests { + values := ComputeSublayers(test.trace) + sort.Sort(sublayerValues(values)) + + assert.Equal(test.values, values, "test: "+test.name) + } +} + +func TestBuildTraceTimestamps(t *testing.T) { + assert := assert.New(t) + + span := func(id, parentId uint64, service, spanType string, start, duration int64) *pb.Span { + return &pb.Span{ + TraceID: 1, + SpanID: id, + ParentID: parentId, + Service: service, + Type: spanType, + Start: start, + Duration: duration, + } + } + + tests := []struct { + name string + trace pb.Trace + expected []int64 + }{ + // + // 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 + // |===|===|===|===|===|===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------------------> + // <-2-----------------> <-3---------> + // <-4---------> + // <-5-------------------> + // <--6--------------------> + // <-7-------------> + { + "mix of everything", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 130), + span(2, 1, "pg", "db", 10, 50), + span(3, 1, "render", "web", 80, 30), + span(4, 2, "pg-read", "db", 20, 30), + span(5, 1, "redis", "cache", 15, 55), + span(6, 1, "rpc1", "rpc", 60, 60), + span(7, 6, "alert", "rpc", 110, 40), + }, + []int64{0, 10, 15, 20, 50, 60, 70, 80, 110, 120, 130, 150}, + }, + } + + for _, test := range tests { + actual := buildTraceTimestamps(test.trace) + + assert.Equal(test.expected, actual, "test: "+test.name) + } +} + +func TestBuildTraceActiveSpansMapping(t *testing.T) { + assert := assert.New(t) + + span := func(id, parentId uint64, service, spanType string, start, duration int64) *pb.Span { + return &pb.Span{ + TraceID: 1, + SpanID: id, + ParentID: parentId, + Service: service, + Type: spanType, + Start: start, + Duration: duration, + } + } + + tests := []struct { + name string + trace pb.Trace + timestamps []int64 + expected map[int64][]uint64 + }{ + // + // 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 + // |===|===|===|===|===|===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------------------> + // <-2-----------------> <-3---------> + // <-4---------> + // <-5-------------------> + // <--6--------------------> + // <-7-------------> + { + "mix of everything", + pb.Trace{ + span(1, 0, "web-server", "web", 0, 130), + span(2, 1, "pg", "db", 10, 50), + span(3, 1, "render", "web", 80, 30), + span(4, 2, "pg-read", "db", 20, 30), + span(5, 1, "redis", "cache", 15, 55), + span(6, 1, "rpc1", "rpc", 60, 60), + span(7, 6, "alert", "rpc", 110, 40), + }, + []int64{0, 10, 15, 20, 50, 60, 70, 80, 110, 120, 130, 150}, + map[int64][]uint64{ + 0: []uint64{1}, + 10: []uint64{2}, + 15: []uint64{2, 5}, + 20: []uint64{4, 5}, + 50: 
[]uint64{2, 5}, + 60: []uint64{5, 6}, + 70: []uint64{6}, + 80: []uint64{3, 6}, + 110: []uint64{7}, + 120: []uint64{1, 7}, + 130: []uint64{7}, + }, + }, + } + + for _, test := range tests { + actual := buildTraceActiveSpansMapping(test.trace, test.timestamps) + + actualSpanIds := make(map[int64][]uint64, len(actual)) + for ts, spans := range actual { + ids := make([]uint64, 0, len(spans)) + for _, span := range spans { + ids = append(ids, span.SpanID) + } + + actualSpanIds[ts] = ids + } + + assert.Equal(test.expected, actualSpanIds, "test: "+test.name) + } +} + +func TestSetSublayersOnSpan(t *testing.T) { + assert := assert.New(t) + + values := []SublayerValue{ + SublayerValue{ + Metric: "_sublayers.duration.by_service", + Tag: Tag{"sublayer_service", "pgsql"}, + Value: 30.0, + }, + SublayerValue{ + Metric: "_sublayers.duration.by_service", + Tag: Tag{"sublayer_service", "pgsql-read"}, + Value: 20.0, + }, + SublayerValue{ + Metric: "_sublayers.duration.by_type", + Tag: Tag{"sublayer_type", "db"}, + Value: 30.0, + }, + SublayerValue{ + Metric: "_sublayers.span_count", + Value: 2.0, + }, + } + + var span pb.Span + SetSublayersOnSpan(&span, values) + + assert.Equal(map[string]float64{ + "_sublayers.span_count": 2.0, + "_sublayers.duration.by_type.sublayer_type:db": 30.0, + "_sublayers.duration.by_service.sublayer_service:pgsql": 30.0, + "_sublayers.duration.by_service.sublayer_service:pgsql-read": 20.0, + }, span.Metrics) +} + +func BenchmarkComputeSublayers(b *testing.B) { + span := func(id, parentId uint64, service, spanType string, start, duration int64) *pb.Span { + return &pb.Span{ + TraceID: 1, + SpanID: id, + ParentID: parentId, + Service: service, + Type: spanType, + Start: start, + Duration: duration, + } + } + + // + // 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 + // |===|===|===|===|===|===|===|===|===|===|===|===|===|===|===| + // <-1-------------------------------------------------> + // <-2-----------------> <-3---------> + // <-4---------> + // <-5-------------------> + // <--6--------------------> + // <-7-------------> + trace := pb.Trace{ + span(1, 0, "web-server", "web", 0, 130), + span(2, 1, "pg", "db", 10, 50), + span(3, 1, "render", "web", 80, 30), + span(4, 2, "pg-read", "db", 20, 30), + span(5, 1, "redis", "cache", 15, 55), + span(6, 1, "rpc1", "rpc", 60, 60), + span(7, 6, "alert", "rpc", 110, 40), + } + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + ComputeSublayers(trace) + } +} diff --git a/pkg/trace/agent/tags.go b/pkg/trace/agent/tags.go new file mode 100644 index 0000000000000..8be204fadab9c --- /dev/null +++ b/pkg/trace/agent/tags.go @@ -0,0 +1,306 @@ +package agent + +import ( + "bytes" + "sort" + "strings" + "unicode" +) + +const maxTagLength = 200 + +// Tag represents a key / value dimension on traces and stats. +type Tag struct { + Name string `json:"name"` + Value string `json:"value"` +} + +// String returns a string representation of a tag +func (t Tag) String() string { + return t.Name + ":" + t.Value +} + +// SplitTag splits the tag into group and value. If it doesn't have a seperator +// the empty string will be used for the group. 
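// For example, SplitTag("env:prod") returns ("env", "prod"), while
// SplitTag("prod") returns ("", "prod").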
+func SplitTag(tag string) (group, value string) { + split := strings.SplitN(tag, ":", 2) + if len(split) == 1 { + return "", split[0] + } + return split[0], split[1] +} + +// NewTagFromString returns a new Tag from a raw string +func NewTagFromString(raw string) Tag { + name, val := SplitTag(raw) + return Tag{name, val} +} + +// TagSet is a combination of given tags, it is equivalent to contexts that we use for metrics. +// Although we choose a different terminology here to avoid confusion, and tag sets do not have +// a notion of activeness over time. A tag can be: +// • one of the fixed ones we defined in the span structure: service, resource and host +// • one of the arbitrary metadata key included in the span (it needs to be turned on manually) +// +// When we track statistics by tag sets, we basically track every tag combination we're interested +// in to create dimensions, for instance: +// • (service) +// • (service, environment) +// • (service, host) +// • (service, resource, environment) +// • (service, resource) +// • .. +type TagSet []Tag + +// NewTagSetFromString returns a new TagSet from a raw string +func NewTagSetFromString(raw string) TagSet { + var tags TagSet + for _, t := range strings.Split(raw, ",") { + tags = append(tags, NewTagFromString(t)) + } + return tags +} + +// TagKey returns a unique key from the string given and the tagset, useful to index stuff on tagsets +func (t TagSet) TagKey(m string) string { + tagStrings := make([]string, len(t)) + for i, tag := range t { + tagStrings[i] = tag.String() + } + sort.Strings(tagStrings) + return m + "|" + strings.Join(tagStrings, ",") +} + +func (t TagSet) Len() int { return len(t) } +func (t TagSet) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t TagSet) Less(i, j int) bool { + if t[i].Name == t[j].Name { + return t[i].Value < t[j].Value + } + return t[i].Name < t[j].Name +} + +// Key returns a string representing a new set of tags. +func (t TagSet) Key() string { + s := make([]string, len(t)) + for i, t := range t { + s[i] = t.String() + } + sort.Strings(s) + return strings.Join(s, ",") +} + +// Get the tag with the particular name +func (t TagSet) Get(name string) Tag { + for _, tag := range t { + if tag.Name == name { + return tag + } + } + return Tag{} +} + +// Unset returns a new tagset without a given value +func (t TagSet) Unset(name string) TagSet { + var j int + var t2 TagSet + for i, tag := range t { + if tag.Name == name { + j = i + 1 + break + } + t2 = append(t2, tag) + } + for i := j; i < len(t); i++ { + t2 = append(t2, t[i]) + } + return t2 +} + +// Match returns a new tag set with only the tags matching the given groups. +func (t TagSet) Match(groups []string) TagSet { + if len(groups) == 0 { + return nil + } + var match []Tag + for _, g := range groups { + tag := t.Get(g) + if tag.Value == "" { + continue + } + match = append(match, tag) + } + ts := TagSet(match) + sort.Sort(ts) + return ts +} + +// HasExactly returns true if we have tags only for the given groups. +func (t TagSet) HasExactly(groups []string) bool { + if len(groups) != len(t) { + return false + } + // FIXME quadratic + for _, g := range groups { + if t.Get(g).Name == "" { + return false + } + } + return true +} + +// MatchFilters returns a tag set of the tags that match certain filters. 
+// A filter is defined as : "KEY:VAL" where: +// * KEY is a non-empty string +// * VALUE is a string (can be empty) +// A tag {Name:k, Value:v} from the input tag set will match if: +// * KEY==k and VALUE is non-empty and v==VALUE +// * KEY==k and VALUE is empty (don't care about v) +func (t TagSet) MatchFilters(filters []string) TagSet { + // FIXME: ugly ? + filterMap := make(map[string]map[string]struct{}) + + for _, f := range filters { + g, v := SplitTag(f) + m, ok := filterMap[g] + if !ok { + m = make(map[string]struct{}) + filterMap[g] = m + } + + if v != "" { + filterMap[g][v] = struct{}{} + } + } + + matchedFilters := TagSet{} + + for _, tag := range t { + vals, ok := filterMap[tag.Name] + if ok { + if len(vals) == 0 { + matchedFilters = append(matchedFilters, tag) + } else { + _, ok := vals[tag.Value] + if ok { + matchedFilters = append(matchedFilters, tag) + } + } + } + } + return matchedFilters +} + +// MergeTagSets merge two tag sets lazily +func MergeTagSets(t1, t2 TagSet) TagSet { + if t1 == nil { + return t2 + } + if t2 == nil { + return t1 + } + t := append(t1, t2...) + + if len(t) < 2 { + return t + } + + // sorting is actually expensive so skip it if we can + if !sort.IsSorted(t) { + sort.Sort(t) + } + + last := t[0] + idx := 1 + for i := 1; i < len(t); i++ { + if t[i].Name != last.Name || t[i].Value != last.Value { + last = t[i] + t[idx] = last + idx++ + + } + } + return t[:idx] +} + +// TagGroup will return the tag group from the given string. For example, +// "host:abc" => "host" +func TagGroup(tag string) string { + for i, c := range tag { + if c == ':' { + return tag[0:i] + } + } + return "" +} + +// FilterTags will return the tags that have the given group. +func FilterTags(tags, groups []string) []string { + var out []string + for _, t := range tags { + tg := TagGroup(t) + for _, g := range groups { + if g == tg { + out = append(out, t) + break + } + } + } + return out +} + +// NormalizeTag applies some normalization to ensure the tags match the +// backend requirements +// taken from dd-go.model.NormalizeTag +func NormalizeTag(tag string) string { + // unless you just throw out unicode, this is already as fast as it gets + + buf := bytes.NewBuffer(make([]byte, 0, 2*len(tag))) + lastWasUnderscore := false + + for _, c := range tag { + // fast path for len check + if buf.Len() >= maxTagLength { + break + } + // fast path for ascii alphabetic chars + switch { + case c >= 'a' && c <= 'z': + buf.WriteRune(c) + lastWasUnderscore = false + continue + case c >= 'A' && c <= 'Z': + c -= 'A' - 'a' + buf.WriteRune(c) + lastWasUnderscore = false + continue + } + + c = unicode.ToLower(c) + switch { + // handle always valid cases + case unicode.IsLetter(c) || c == ':': + buf.WriteRune(c) + lastWasUnderscore = false + // skip any characters that can't start the string + case buf.Len() == 0: + continue + // handle valid characters that can't start the string. + case unicode.IsDigit(c) || c == '.' || c == '/' || c == '-': + buf.WriteRune(c) + lastWasUnderscore = false + // convert anything else to underscores (including underscores), but only allow one in a row. 
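		// e.g. NormalizeTag("Hello, World! 99") yields "hello_world_99": the comma
		// and the bang each become a single underscore, and the spaces following
		// them are dropped rather than producing a second one.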
+ case !lastWasUnderscore: + buf.WriteRune('_') + lastWasUnderscore = true + } + } + + // strip trailing underscores + if lastWasUnderscore { + b := buf.Bytes() + return string(b[:len(b)-1]) + } + + return buf.String() +} diff --git a/pkg/trace/agent/tags_test.go b/pkg/trace/agent/tags_test.go new file mode 100644 index 0000000000000..927921b8e9089 --- /dev/null +++ b/pkg/trace/agent/tags_test.go @@ -0,0 +1,147 @@ +package agent + +import ( + "sort" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGroup(t *testing.T) { + cases := map[string]string{ + "a:1": "a", + "a": "", + "a:1:1": "a", + "abc:2": "abc", + } + + assert := assert.New(t) + for in, out := range cases { + actual := TagGroup(in) + assert.Equal(out, actual) + } +} + +func TestSort(t *testing.T) { + t1 := NewTagSetFromString("a:2,a:1,a:3") + t2 := NewTagSetFromString("a:1,a:2,a:3") + sort.Sort(t1) + assert.Equal(t, t1, t2) + + // trick: service MaxResourceLen { + s.Resource = s.Resource[:MaxResourceLen] + log.Debugf("span.truncate: truncated `Resource` (max %d chars): %s", MaxResourceLen, s.Resource) + } + + // Error - Nothing to do + // Optional data, Meta & Metrics can be nil + // Soft fail on those + for k, v := range s.Meta { + modified := false + + if len(k) > MaxMetaKeyLen { + log.Debugf("span.truncate: truncating `Meta` key (max %d chars): %s", MaxMetaKeyLen, k) + delete(s.Meta, k) + k = k[:MaxMetaKeyLen] + "..." + modified = true + } + + if len(v) > MaxMetaValLen { + v = v[:MaxMetaValLen] + "..." + modified = true + } + + if modified { + s.Meta[k] = v + } + } + + for k, v := range s.Metrics { + if len(k) > MaxMetricsKeyLen { + log.Debugf("span.truncate: truncating `Metrics` key (max %d chars): %s", MaxMetricsKeyLen, k) + delete(s.Metrics, k) + k = k[:MaxMetricsKeyLen] + "..." 
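+ // re-insert the value under the truncated key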
+ + s.Metrics[k] = v + } + } +} diff --git a/pkg/trace/agent/truncator_test.go b/pkg/trace/agent/truncator_test.go new file mode 100644 index 0000000000000..e10c1eab47adc --- /dev/null +++ b/pkg/trace/agent/truncator_test.go @@ -0,0 +1,66 @@ +package agent + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTruncateResourcePassThru(t *testing.T) { + s := testSpan() + before := s.Resource + Truncate(s) + assert.Equal(t, before, s.Resource) +} + +func TestTruncateLongResource(t *testing.T) { + s := testSpan() + s.Resource = strings.Repeat("TOOLONG", 5000) + Truncate(s) + assert.Equal(t, 5000, len(s.Resource)) +} + +func TestTruncateMetricsPassThru(t *testing.T) { + s := testSpan() + before := s.Metrics + Truncate(s) + assert.Equal(t, before, s.Metrics) +} + +func TestTruncateMetricsKeyTooLong(t *testing.T) { + s := testSpan() + key := strings.Repeat("TOOLONG", 1000) + s.Metrics[key] = 42 + Truncate(s) + for k := range s.Metrics { + assert.True(t, len(k) < MaxMetricsKeyLen+4) + } +} + +func TestTruncateMetaPassThru(t *testing.T) { + s := testSpan() + before := s.Meta + Truncate(s) + assert.Equal(t, before, s.Meta) +} + +func TestTruncateMetaKeyTooLong(t *testing.T) { + s := testSpan() + key := strings.Repeat("TOOLONG", 1000) + s.Meta[key] = "foo" + Truncate(s) + for k := range s.Meta { + assert.True(t, len(k) < MaxMetaKeyLen+4) + } +} + +func TestTruncateMetaValueTooLong(t *testing.T) { + s := testSpan() + val := strings.Repeat("TOOLONG", 5000) + s.Meta["foo"] = val + Truncate(s) + for _, v := range s.Meta { + assert.True(t, len(v) < MaxMetaValLen+4) + } +} diff --git a/pkg/trace/agent/weighted_span.go b/pkg/trace/agent/weighted_span.go new file mode 100644 index 0000000000000..7aa437571d1dd --- /dev/null +++ b/pkg/trace/agent/weighted_span.go @@ -0,0 +1,34 @@ +package agent + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" +) + +// WeightedSpan extends Span to contain weights required by the Concentrator. +type WeightedSpan struct { + Weight float64 // Span weight. Similar to the trace root.Weight(). + TopLevel bool // Is this span a service top-level or not. Similar to span.TopLevel(). + + *pb.Span +} + +// WeightedTrace is a slice of WeightedSpan pointers. +type WeightedTrace []*WeightedSpan + +// NewWeightedTrace returns a weighted trace, with coefficient required by the concentrator. 
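+// The root span's sampling weight is applied to every span in the trace; TopLevel is computed per span.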
+func NewWeightedTrace(trace pb.Trace, root *pb.Span) WeightedTrace { + wt := make(WeightedTrace, len(trace)) + + weight := sampler.Weight(root) + + for i := range trace { + wt[i] = &WeightedSpan{ + Span: trace[i], + Weight: weight, + TopLevel: traceutil.HasTopLevel(trace[i]), + } + } + return wt +} diff --git a/pkg/trace/api/api.go b/pkg/trace/api/api.go new file mode 100644 index 0000000000000..6d46e5cf8b557 --- /dev/null +++ b/pkg/trace/api/api.go @@ -0,0 +1,425 @@ +package api + +import ( + "context" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "net/http" + "sort" + "strings" + "sync/atomic" + "time" + + log "github.com/cihub/seelog" + "github.com/tinylib/msgp/msgp" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/osutil" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +const ( + maxRequestBodyLength = 10 * 1024 * 1024 + tagTraceHandler = "handler:traces" + tagServiceHandler = "handler:services" +) + +// Version is a dumb way to version our collector handlers +type Version string + +const ( + // v01 DEPRECATED, FIXME[1.x] + // Traces: JSON, slice of spans + // Services: JSON, map[string]map[string][string] + v01 Version = "v0.1" + // v02 DEPRECATED, FIXME[1.x] + // Traces: JSON, slice of traces + // Services: JSON, map[string]map[string][string] + v02 Version = "v0.2" + // v03 + // Traces: msgpack/JSON (Content-Type) slice of traces + // Services: msgpack/JSON, map[string]map[string][string] + v03 Version = "v0.3" + // v04 + // Traces: msgpack/JSON (Content-Type) slice of traces + returns service sampling ratios + // Services: msgpack/JSON, map[string]map[string][string] + v04 Version = "v0.4" +) + +// HTTPReceiver is a collector that uses HTTP protocol and just holds +// a chan where the spans received are sent one by one +type HTTPReceiver struct { + Stats *info.ReceiverStats + PreSampler *sampler.PreSampler + Out chan pb.Trace + + services chan pb.ServicesMetadata + conf *config.AgentConfig + dynConf *sampler.DynamicConfig + server *http.Server + + maxRequestBodyLength int64 + debug bool +} + +// NewHTTPReceiver returns a pointer to a new HTTPReceiver +func NewHTTPReceiver( + conf *config.AgentConfig, dynConf *sampler.DynamicConfig, out chan pb.Trace, services chan pb.ServicesMetadata, +) *HTTPReceiver { + // use buffered channels so that handlers are not waiting on downstream processing + return &HTTPReceiver{ + Stats: info.NewReceiverStats(), + PreSampler: sampler.NewPreSampler(), + Out: out, + + conf: conf, + dynConf: dynConf, + services: services, + + maxRequestBodyLength: maxRequestBodyLength, + debug: strings.ToLower(conf.LogLevel) == "debug", + } +} + +// Run starts doing the HTTP server and is ready to receive traces +func (r *HTTPReceiver) Run() { + // FIXME[1.x]: remove all those legacy endpoints + code that goes with it + http.HandleFunc("/spans", r.httpHandleWithVersion(v01, r.handleTraces)) + http.HandleFunc("/services", r.httpHandleWithVersion(v01, r.handleServices)) + http.HandleFunc("/v0.1/spans", r.httpHandleWithVersion(v01, r.handleTraces)) + http.HandleFunc("/v0.1/services", r.httpHandleWithVersion(v01, r.handleServices)) + http.HandleFunc("/v0.2/traces", r.httpHandleWithVersion(v02, r.handleTraces)) + 
http.HandleFunc("/v0.2/services", r.httpHandleWithVersion(v02, r.handleServices)) + http.HandleFunc("/v0.3/traces", r.httpHandleWithVersion(v03, r.handleTraces)) + http.HandleFunc("/v0.3/services", r.httpHandleWithVersion(v03, r.handleServices)) + + // current collector API + http.HandleFunc("/v0.4/traces", r.httpHandleWithVersion(v04, r.handleTraces)) + http.HandleFunc("/v0.4/services", r.httpHandleWithVersion(v04, r.handleServices)) + + // expvar implicitely publishes "/debug/vars" on the same port + addr := fmt.Sprintf("%s:%d", r.conf.ReceiverHost, r.conf.ReceiverPort) + if err := r.Listen(addr, ""); err != nil { + osutil.Exitf("%v", err) + } + + go r.PreSampler.Run() + + go func() { + defer watchdog.LogOnPanic() + r.logStats() + }() +} + +// Listen creates a new HTTP server listening on the provided address. +func (r *HTTPReceiver) Listen(addr, logExtra string) error { + listener, err := net.Listen("tcp", addr) + if err != nil { + return fmt.Errorf("cannot listen on %s: %v", addr, err) + } + + ln, err := newRateLimitedListener(listener, r.conf.ConnectionLimit) + if err != nil { + return fmt.Errorf("cannot create listener: %v", err) + } + timeout := 5 * time.Second + if r.conf.ReceiverTimeout > 0 { + timeout = time.Duration(r.conf.ReceiverTimeout) * time.Second + } + r.server = &http.Server{ + ReadTimeout: timeout, + WriteTimeout: timeout, + } + log.Infof("listening for traces at http://%s%s", addr, logExtra) + + go func() { + defer watchdog.LogOnPanic() + ln.Refresh(r.conf.ConnectionLimit) + }() + go func() { + defer watchdog.LogOnPanic() + r.server.Serve(ln) + }() + + return nil +} + +// Stop stops the receiver and shuts down the HTTP server. +func (r *HTTPReceiver) Stop() error { + expiry := time.Now().Add(20 * time.Second) // give it 20 seconds + ctx, _ := context.WithDeadline(context.Background(), expiry) + return r.server.Shutdown(ctx) +} + +func (r *HTTPReceiver) httpHandle(fn http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, req *http.Request) { + req.Body = NewLimitedReader(req.Body, r.maxRequestBodyLength) + defer req.Body.Close() + + fn(w, req) + } +} + +func (r *HTTPReceiver) httpHandleWithVersion(v Version, f func(Version, http.ResponseWriter, *http.Request)) http.HandlerFunc { + return r.httpHandle(func(w http.ResponseWriter, req *http.Request) { + contentType := req.Header.Get("Content-Type") + if contentType == "application/msgpack" && (v == v01 || v == v02) { + // msgpack is only supported for versions 0.3 + log.Errorf("rejecting client request, unsupported media type %q", contentType) + HTTPFormatError([]string{tagTraceHandler, fmt.Sprintf("v:%s", v)}, w) + return + } + + f(v, w, req) + }) +} + +func (r *HTTPReceiver) replyTraces(v Version, w http.ResponseWriter) { + switch v { + case v01: + fallthrough + case v02: + fallthrough + case v03: + // Simple response, simply acknowledge with "OK" + HTTPOK(w) + case v04: + // Return the recommended sampling rate for each service as a JSON. 
+ HTTPRateByService(w, r.dynConf) + } +} + +// handleTraces knows how to handle a bunch of traces +func (r *HTTPReceiver) handleTraces(v Version, w http.ResponseWriter, req *http.Request) { + if !r.PreSampler.Sample(req) { + io.Copy(ioutil.Discard, req.Body) + HTTPOK(w) + return + } + + traces, ok := getTraces(v, w, req) + if !ok { + return + } + + // We successfuly decoded the payload + r.replyTraces(v, w) + + // We parse the tags from the header + tags := info.Tags{ + Lang: req.Header.Get("Datadog-Meta-Lang"), + LangVersion: req.Header.Get("Datadog-Meta-Lang-Version"), + Interpreter: req.Header.Get("Datadog-Meta-Lang-Interpreter"), + TracerVersion: req.Header.Get("Datadog-Meta-Tracer-Version"), + } + + // We get the address of the struct holding the stats associated to the tags + ts := r.Stats.GetTagStats(tags) + + bytesRead := req.Body.(*LimitedReader).Count + if bytesRead > 0 { + atomic.AddInt64(&ts.TracesBytes, int64(bytesRead)) + } + + // normalize data + for _, trace := range traces { + spans := len(trace) + + atomic.AddInt64(&ts.TracesReceived, 1) + atomic.AddInt64(&ts.SpansReceived, int64(spans)) + + err := agent.NormalizeTrace(trace) + if err != nil { + atomic.AddInt64(&ts.TracesDropped, 1) + atomic.AddInt64(&ts.SpansDropped, int64(spans)) + + errorMsg := fmt.Sprintf("dropping trace reason: %s (debug for more info), %v", err, trace) + + // avoid truncation in DEBUG mode + if len(errorMsg) > 150 && !r.debug { + errorMsg = errorMsg[:150] + "..." + } + log.Errorf(errorMsg) + } else { + select { + case r.Out <- trace: + // if our downstream consumer is slow, we drop the trace on the floor + // this is a safety net against us using too much memory + // when clients flood us + default: + atomic.AddInt64(&ts.TracesDropped, 1) + atomic.AddInt64(&ts.SpansDropped, int64(spans)) + + log.Errorf("dropping trace reason: rate-limited") + } + } + } +} + +// handleServices handle a request with a list of several services +func (r *HTTPReceiver) handleServices(v Version, w http.ResponseWriter, req *http.Request) { + var servicesMeta pb.ServicesMetadata + + contentType := req.Header.Get("Content-Type") + if err := decodeReceiverPayload(req.Body, &servicesMeta, v, contentType); err != nil { + log.Errorf("cannot decode %s services payload: %v", v, err) + HTTPDecodingError(err, []string{tagServiceHandler, fmt.Sprintf("v:%s", v)}, w) + return + } + + HTTPOK(w) + + // We parse the tags from the header + tags := info.Tags{ + Lang: req.Header.Get("Datadog-Meta-Lang"), + LangVersion: req.Header.Get("Datadog-Meta-Lang-Version"), + Interpreter: req.Header.Get("Datadog-Meta-Lang-Interpreter"), + TracerVersion: req.Header.Get("Datadog-Meta-Tracer-Version"), + } + + // We get the address of the struct holding the stats associated to the tags + ts := r.Stats.GetTagStats(tags) + + atomic.AddInt64(&ts.ServicesReceived, int64(len(servicesMeta))) + + bytesRead := req.Body.(*LimitedReader).Count + if bytesRead > 0 { + atomic.AddInt64(&ts.ServicesBytes, int64(bytesRead)) + } + + r.services <- servicesMeta +} + +// logStats periodically submits stats about the receiver to statsd +func (r *HTTPReceiver) logStats() { + var lastLog time.Time + accStats := info.NewReceiverStats() + + for now := range time.Tick(10 * time.Second) { + metrics.Gauge("datadog.trace_agent.heartbeat", 1, nil, 1) + + // We update accStats with the new stats we collected + accStats.Acc(r.Stats) + + // Publish the stats accumulated during the last flush + r.Stats.Publish() + + // We reset the stats accumulated during the last 10s. 
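+ // (they were already merged into accStats above, so nothing is lost)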
+ r.Stats.Reset() + + if now.Sub(lastLog) >= time.Minute { + // We expose the stats accumulated to expvar + info.UpdateReceiverStats(accStats) + + for _, logStr := range accStats.Strings() { + log.Info(logStr) + } + + // We reset the stats accumulated during the last minute + accStats.Reset() + lastLog = now + + // Also publish rates by service (they are updated by receiver) + rates := r.dynConf.RateByService.GetAll() + info.UpdateRateByService(rates) + } + } +} + +// Languages returns the list of the languages used in the traces the agent receives. +func (r *HTTPReceiver) Languages() string { + // We need to use this map because we can have several tags for a same language. + langs := make(map[string]bool) + str := []string{} + + r.Stats.RLock() + for tags := range r.Stats.Stats { + if _, ok := langs[tags.Lang]; !ok { + str = append(str, tags.Lang) + langs[tags.Lang] = true + } + } + r.Stats.RUnlock() + + sort.Strings(str) + return strings.Join(str, "|") +} + +func getTraces(v Version, w http.ResponseWriter, req *http.Request) (pb.Traces, bool) { + var traces pb.Traces + contentType := req.Header.Get("Content-Type") + + switch v { + case v01: + // We cannot use decodeReceiverPayload because []model.Span does not + // implement msgp.Decodable. This hack can be removed once we + // drop v01 support. + if contentType != "application/json" && contentType != "text/json" && contentType != "" { + log.Errorf("rejecting client request, unsupported media type %q", contentType) + HTTPFormatError([]string{tagTraceHandler, fmt.Sprintf("v:%s", v)}, w) + return nil, false + } + + // in v01 we actually get spans that we have to transform in traces + var spans []pb.Span + if err := json.NewDecoder(req.Body).Decode(&spans); err != nil { + log.Errorf("cannot decode %s traces payload: %v", v, err) + HTTPDecodingError(err, []string{tagTraceHandler, fmt.Sprintf("v:%s", v)}, w) + return nil, false + } + traces = tracesFromSpans(spans) + case v02: + fallthrough + case v03: + fallthrough + case v04: + if err := decodeReceiverPayload(req.Body, &traces, v, contentType); err != nil { + log.Errorf("cannot decode %s traces payload: %v", v, err) + HTTPDecodingError(err, []string{tagTraceHandler, fmt.Sprintf("v:%s", v)}, w) + return nil, false + } + default: + HTTPEndpointNotSupported([]string{tagTraceHandler, fmt.Sprintf("v:%s", v)}, w) + return nil, false + } + + return traces, true +} + +func decodeReceiverPayload(r io.Reader, dest msgp.Decodable, v Version, contentType string) error { + switch contentType { + case "application/msgpack": + return msgp.Decode(r, dest) + + case "application/json": + fallthrough + case "text/json": + fallthrough + case "": + return json.NewDecoder(r).Decode(dest) + + default: + panic(fmt.Sprintf("unhandled content type %q", contentType)) + } +} + +func tracesFromSpans(spans []pb.Span) pb.Traces { + traces := pb.Traces{} + byID := make(map[uint64][]*pb.Span) + for _, s := range spans { + byID[s.TraceID] = append(byID[s.TraceID], &s) + } + for _, t := range byID { + traces = append(traces, t) + } + + return traces +} diff --git a/pkg/trace/api/api_test.go b/pkg/trace/api/api_test.go new file mode 100644 index 0000000000000..9d9563f502a7c --- /dev/null +++ b/pkg/trace/api/api_test.go @@ -0,0 +1,718 @@ +package api + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net/http" + "net/http/httptest" + "strconv" + "sync" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + 
"github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + "github.com/stretchr/testify/assert" + "github.com/tinylib/msgp/msgp" +) + +// Traces shouldn't come from more than 5 different sources +var langs = []string{"python", "ruby", "go", "java", "C#"} + +// headerFields is a map used to decode the header metas +var headerFields = map[string]string{ + "lang": "Datadog-Meta-Lang", + "lang_version": "Datadog-Meta-Lang-Version", + "interpreter": "Datadog-Meta-Lang-Interpreter", + "tracer_version": "Datadog-Meta-Tracer-Version", +} + +func newTestReceiverFromConfig(conf *config.AgentConfig) *HTTPReceiver { + dynConf := sampler.NewDynamicConfig("none") + + rawTraceChan := make(chan pb.Trace, 5000) + serviceChan := make(chan pb.ServicesMetadata, 50) + receiver := NewHTTPReceiver(conf, dynConf, rawTraceChan, serviceChan) + + return receiver +} + +func newTestReceiverConfig() *config.AgentConfig { + conf := config.New() + conf.Endpoints[0].APIKey = "test" + + return conf +} + +func TestReceiverRequestBodyLength(t *testing.T) { + assert := assert.New(t) + + // save the global mux aside, we don't want to break other tests + defaultMux := http.DefaultServeMux + http.DefaultServeMux = http.NewServeMux() + + conf := newTestReceiverConfig() + receiver := newTestReceiverFromConfig(conf) + receiver.maxRequestBodyLength = 2 + go receiver.Run() + + defer func() { + receiver.Stop() + http.DefaultServeMux = defaultMux + }() + + url := fmt.Sprintf("http://%s:%d/v0.4/traces", + conf.ReceiverHost, conf.ReceiverPort) + + // Before going further, make sure receiver is started + // since it's running in another goroutine + for i := 0; i < 10; i++ { + client := &http.Client{} + + body := bytes.NewBufferString("[]") + req, err := http.NewRequest("POST", url, body) + assert.Nil(err) + + resp, err := client.Do(req) + if err == nil && resp.StatusCode == http.StatusOK { + break + } + time.Sleep(10 * time.Millisecond) + } + + testBody := func(expectedStatus int, bodyData string) { + client := &http.Client{} + + body := bytes.NewBufferString(bodyData) + req, err := http.NewRequest("POST", url, body) + assert.Nil(err) + + resp, err := client.Do(req) + assert.Nil(err) + assert.Equal(expectedStatus, resp.StatusCode) + } + + testBody(http.StatusOK, "[]") + testBody(http.StatusRequestEntityTooLarge, " []") +} + +func TestLegacyReceiver(t *testing.T) { + // testing traces without content-type in agent endpoints, it should use JSON decoding + assert := assert.New(t) + conf := newTestReceiverConfig() + testCases := []struct { + name string + r *HTTPReceiver + apiVersion Version + contentType string + traces pb.Trace + }{ + {"v01 with empty content-type", newTestReceiverFromConfig(conf), v01, "", pb.Trace{testutil.GetTestSpan()}}, + {"v01 with application/json", newTestReceiverFromConfig(conf), v01, "application/json", pb.Trace{testutil.GetTestSpan()}}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf(tc.name), func(t *testing.T) { + // start testing server + server := httptest.NewServer( + http.HandlerFunc(tc.r.httpHandleWithVersion(tc.apiVersion, tc.r.handleTraces)), + ) + + // send traces to that endpoint without a content-type + data, err := json.Marshal(tc.traces) + assert.Nil(err) + req, err := http.NewRequest("POST", server.URL, bytes.NewBuffer(data)) + assert.Nil(err) + req.Header.Set("Content-Type", tc.contentType) + + client := &http.Client{} + resp, err := client.Do(req) + assert.Nil(err) + 
assert.Equal(200, resp.StatusCode) + + // now we should be able to read the trace data + select { + case rt := <-tc.r.Out: + assert.Len(rt, 1) + span := rt[0] + assert.Equal(uint64(42), span.TraceID) + assert.Equal(uint64(52), span.SpanID) + assert.Equal("fennel_is_amazing", span.Service) + assert.Equal("something_that_should_be_a_metric", span.Name) + assert.Equal("NOT touched because it is going to be hashed", span.Resource) + assert.Equal("192.168.0.1", span.Meta["http.host"]) + assert.Equal(41.99, span.Metrics["http.monitor"]) + default: + t.Fatalf("no data received") + } + + resp.Body.Close() + server.Close() + }) + } +} + +func TestReceiverJSONDecoder(t *testing.T) { + // testing traces without content-type in agent endpoints, it should use JSON decoding + assert := assert.New(t) + conf := newTestReceiverConfig() + testCases := []struct { + name string + r *HTTPReceiver + apiVersion Version + contentType string + traces []pb.Trace + }{ + {"v02 with empty content-type", newTestReceiverFromConfig(conf), v02, "", testutil.GetTestTrace(1, 1, false)}, + {"v03 with empty content-type", newTestReceiverFromConfig(conf), v03, "", testutil.GetTestTrace(1, 1, false)}, + {"v04 with empty content-type", newTestReceiverFromConfig(conf), v04, "", testutil.GetTestTrace(1, 1, false)}, + {"v02 with application/json", newTestReceiverFromConfig(conf), v02, "application/json", testutil.GetTestTrace(1, 1, false)}, + {"v03 with application/json", newTestReceiverFromConfig(conf), v03, "application/json", testutil.GetTestTrace(1, 1, false)}, + {"v04 with application/json", newTestReceiverFromConfig(conf), v04, "application/json", testutil.GetTestTrace(1, 1, false)}, + {"v02 with text/json", newTestReceiverFromConfig(conf), v02, "text/json", testutil.GetTestTrace(1, 1, false)}, + {"v03 with text/json", newTestReceiverFromConfig(conf), v03, "text/json", testutil.GetTestTrace(1, 1, false)}, + {"v04 with text/json", newTestReceiverFromConfig(conf), v04, "text/json", testutil.GetTestTrace(1, 1, false)}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf(tc.name), func(t *testing.T) { + // start testing server + server := httptest.NewServer( + http.HandlerFunc(tc.r.httpHandleWithVersion(tc.apiVersion, tc.r.handleTraces)), + ) + + // send traces to that endpoint without a content-type + data, err := json.Marshal(tc.traces) + assert.Nil(err) + req, err := http.NewRequest("POST", server.URL, bytes.NewBuffer(data)) + assert.Nil(err) + req.Header.Set("Content-Type", tc.contentType) + + client := &http.Client{} + resp, err := client.Do(req) + assert.Nil(err) + assert.Equal(200, resp.StatusCode) + + // now we should be able to read the trace data + select { + case rt := <-tc.r.Out: + assert.Len(rt, 1) + span := rt[0] + assert.Equal(uint64(42), span.TraceID) + assert.Equal(uint64(52), span.SpanID) + assert.Equal("fennel_is_amazing", span.Service) + assert.Equal("something_that_should_be_a_metric", span.Name) + assert.Equal("NOT touched because it is going to be hashed", span.Resource) + assert.Equal("192.168.0.1", span.Meta["http.host"]) + assert.Equal(41.99, span.Metrics["http.monitor"]) + default: + t.Fatalf("no data received") + } + + resp.Body.Close() + server.Close() + }) + } +} + +func TestReceiverMsgpackDecoder(t *testing.T) { + // testing traces without content-type in agent endpoints, it should use Msgpack decoding + // or it should raise a 415 Unsupported media type + assert := assert.New(t) + conf := newTestReceiverConfig() + testCases := []struct { + name string + r *HTTPReceiver + apiVersion Version + 
contentType string + traces pb.Traces + }{ + {"v01 with application/msgpack", newTestReceiverFromConfig(conf), v01, "application/msgpack", testutil.GetTestTrace(1, 1, false)}, + {"v02 with application/msgpack", newTestReceiverFromConfig(conf), v02, "application/msgpack", testutil.GetTestTrace(1, 1, false)}, + {"v03 with application/msgpack", newTestReceiverFromConfig(conf), v03, "application/msgpack", testutil.GetTestTrace(1, 1, false)}, + {"v04 with application/msgpack", newTestReceiverFromConfig(conf), v04, "application/msgpack", testutil.GetTestTrace(1, 1, false)}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf(tc.name), func(t *testing.T) { + // start testing server + server := httptest.NewServer( + http.HandlerFunc(tc.r.httpHandleWithVersion(tc.apiVersion, tc.r.handleTraces)), + ) + + // send traces to that endpoint using the msgpack content-type + var buf bytes.Buffer + err := msgp.Encode(&buf, tc.traces) + assert.Nil(err) + req, err := http.NewRequest("POST", server.URL, &buf) + assert.Nil(err) + req.Header.Set("Content-Type", tc.contentType) + + client := &http.Client{} + resp, err := client.Do(req) + assert.Nil(err) + + switch tc.apiVersion { + case v01: + assert.Equal(415, resp.StatusCode) + case v02: + assert.Equal(415, resp.StatusCode) + case v03: + assert.Equal(200, resp.StatusCode) + + // now we should be able to read the trace data + select { + case rt := <-tc.r.Out: + assert.Len(rt, 1) + span := rt[0] + assert.Equal(uint64(42), span.TraceID) + assert.Equal(uint64(52), span.SpanID) + assert.Equal("fennel_is_amazing", span.Service) + assert.Equal("something_that_should_be_a_metric", span.Name) + assert.Equal("NOT touched because it is going to be hashed", span.Resource) + assert.Equal("192.168.0.1", span.Meta["http.host"]) + assert.Equal(41.99, span.Metrics["http.monitor"]) + default: + t.Fatalf("no data received") + } + + body, err := ioutil.ReadAll(resp.Body) + assert.Nil(err) + assert.Equal("OK\n", string(body)) + case v04: + assert.Equal(200, resp.StatusCode) + + // now we should be able to read the trace data + select { + case rt := <-tc.r.Out: + assert.Len(rt, 1) + span := rt[0] + assert.Equal(uint64(42), span.TraceID) + assert.Equal(uint64(52), span.SpanID) + assert.Equal("fennel_is_amazing", span.Service) + assert.Equal("something_that_should_be_a_metric", span.Name) + assert.Equal("NOT touched because it is going to be hashed", span.Resource) + assert.Equal("192.168.0.1", span.Meta["http.host"]) + assert.Equal(41.99, span.Metrics["http.monitor"]) + default: + t.Fatalf("no data received") + } + + body, err := ioutil.ReadAll(resp.Body) + assert.Nil(err) + var tr traceResponse + err = json.Unmarshal(body, &tr) + assert.Nil(err, "the answer should be a valid JSON") + } + + resp.Body.Close() + server.Close() + }) + } +} + +func TestReceiverServiceJSONDecoder(t *testing.T) { + // testing traces without content-type in agent endpoints, it should use JSON decoding + assert := assert.New(t) + conf := newTestReceiverConfig() + testCases := []struct { + name string + r *HTTPReceiver + apiVersion Version + contentType string + }{ + {"v01 with empty content-type", newTestReceiverFromConfig(conf), v01, ""}, + {"v02 with empty content-type", newTestReceiverFromConfig(conf), v02, ""}, + {"v03 with empty content-type", newTestReceiverFromConfig(conf), v03, ""}, + {"v04 with empty content-type", newTestReceiverFromConfig(conf), v04, ""}, + {"v01 with application/json", newTestReceiverFromConfig(conf), v01, "application/json"}, + {"v02 with application/json", 
newTestReceiverFromConfig(conf), v02, "application/json"}, + {"v03 with application/json", newTestReceiverFromConfig(conf), v03, "application/json"}, + {"v04 with application/json", newTestReceiverFromConfig(conf), v04, "application/json"}, + {"v01 with text/json", newTestReceiverFromConfig(conf), v01, "text/json"}, + {"v02 with text/json", newTestReceiverFromConfig(conf), v02, "text/json"}, + {"v03 with text/json", newTestReceiverFromConfig(conf), v03, "text/json"}, + {"v04 with text/json", newTestReceiverFromConfig(conf), v04, "text/json"}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf(tc.name), func(t *testing.T) { + // start testing server + server := httptest.NewServer( + http.HandlerFunc(tc.r.httpHandleWithVersion(tc.apiVersion, tc.r.handleServices)), + ) + + // send service to that endpoint using the JSON content-type + services := pb.ServicesMetadata{ + "backend": map[string]string{ + "app": "django", + "app_type": "web", + }, + "database": map[string]string{ + "app": "postgres", + "app_type": "db", + }, + } + + data, err := json.Marshal(services) + assert.Nil(err) + req, err := http.NewRequest("POST", server.URL, bytes.NewBuffer(data)) + assert.Nil(err) + req.Header.Set("Content-Type", tc.contentType) + + client := &http.Client{} + resp, err := client.Do(req) + assert.Nil(err) + + assert.Equal(200, resp.StatusCode) + + // now we should be able to read the trace data + select { + case rt := <-tc.r.services: + assert.Len(rt, 2) + assert.Equal(rt["backend"]["app"], "django") + assert.Equal(rt["backend"]["app_type"], "web") + assert.Equal(rt["database"]["app"], "postgres") + assert.Equal(rt["database"]["app_type"], "db") + default: + t.Fatalf("no data received") + } + + resp.Body.Close() + server.Close() + }) + } +} + +func TestReceiverServiceMsgpackDecoder(t *testing.T) { + // testing traces without content-type in agent endpoints, it should use Msgpack decoding + // or it should raise a 415 Unsupported media type + assert := assert.New(t) + conf := newTestReceiverConfig() + testCases := []struct { + name string + r *HTTPReceiver + apiVersion Version + contentType string + }{ + {"v01 with application/msgpack", newTestReceiverFromConfig(conf), v01, "application/msgpack"}, + {"v02 with application/msgpack", newTestReceiverFromConfig(conf), v02, "application/msgpack"}, + {"v03 with application/msgpack", newTestReceiverFromConfig(conf), v03, "application/msgpack"}, + {"v04 with application/msgpack", newTestReceiverFromConfig(conf), v04, "application/msgpack"}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf(tc.name), func(t *testing.T) { + // start testing server + server := httptest.NewServer( + http.HandlerFunc(tc.r.httpHandleWithVersion(tc.apiVersion, tc.r.handleServices)), + ) + + // send service to that endpoint using the JSON content-type + services := pb.ServicesMetadata{ + "backend": map[string]string{ + "app": "django", + "app_type": "web", + }, + "database": map[string]string{ + "app": "postgres", + "app_type": "db", + }, + } + + // send traces to that endpoint using the Msgpack content-type + var buf bytes.Buffer + err := msgp.Encode(&buf, services) + assert.Nil(err) + req, err := http.NewRequest("POST", server.URL, &buf) + assert.Nil(err) + req.Header.Set("Content-Type", tc.contentType) + + client := &http.Client{} + resp, err := client.Do(req) + assert.Nil(err) + + switch tc.apiVersion { + case v01: + assert.Equal(415, resp.StatusCode) + case v02: + assert.Equal(415, resp.StatusCode) + case v03: + assert.Equal(200, resp.StatusCode) + + // now we 
should be able to read the trace data + select { + case rt := <-tc.r.services: + assert.Len(rt, 2) + assert.Equal(rt["backend"]["app"], "django") + assert.Equal(rt["backend"]["app_type"], "web") + assert.Equal(rt["database"]["app"], "postgres") + assert.Equal(rt["database"]["app_type"], "db") + default: + t.Fatalf("no data received") + } + + body, err := ioutil.ReadAll(resp.Body) + assert.Nil(err) + assert.Equal("OK\n", string(body)) + case v04: + assert.Equal(200, resp.StatusCode) + + // now we should be able to read the trace data + select { + case rt := <-tc.r.services: + assert.Len(rt, 2) + assert.Equal(rt["backend"]["app"], "django") + assert.Equal(rt["backend"]["app_type"], "web") + assert.Equal(rt["database"]["app"], "postgres") + assert.Equal(rt["database"]["app_type"], "db") + default: + t.Fatalf("no data received") + } + + body, err := ioutil.ReadAll(resp.Body) + assert.Nil(err) + assert.Equal("OK\n", string(body)) + } + + resp.Body.Close() + server.Close() + }) + } +} + +func TestHandleTraces(t *testing.T) { + assert := assert.New(t) + + // prepare the msgpack payload + var buf bytes.Buffer + msgp.Encode(&buf, testutil.GetTestTrace(10, 10, true)) + + // prepare the receiver + conf := newTestReceiverConfig() + receiver := newTestReceiverFromConfig(conf) + + // response recorder + handler := http.HandlerFunc(receiver.httpHandleWithVersion(v04, receiver.handleTraces)) + + for n := 0; n < 10; n++ { + // consume the traces channel without doing anything + select { + case <-receiver.Out: + default: + } + + // forge the request + rr := httptest.NewRecorder() + req, _ := http.NewRequest("POST", "/v0.4/traces", bytes.NewReader(buf.Bytes())) + req.Header.Set("Content-Type", "application/msgpack") + + // Add meta data to simulate data comming from multiple applications + req.Header.Set("Datadog-Meta-Lang", langs[n%len(langs)]) + + handler.ServeHTTP(rr, req) + } + + rs := receiver.Stats + assert.Equal(5, len(rs.Stats)) // We have a tagStats struct for each application + + // We test stats for each app + for _, lang := range langs { + ts, ok := rs.Stats[info.Tags{Lang: lang}] + assert.True(ok) + assert.Equal(int64(20), ts.TracesReceived) + assert.Equal(int64(59222), ts.TracesBytes) + } + // make sure we have all our languages registered + assert.Equal("C#|go|java|python|ruby", receiver.Languages()) +} + +// chunkedReader is a reader which forces partial reads, this is required +// to trigger some network related bugs, such as body not being read fully by server. +// Without this, all the data could be read/written at once, not triggering the issue. +type chunkedReader struct { + reader io.Reader +} + +func (sr *chunkedReader) Read(p []byte) (n int, err error) { + size := 1024 + if size > len(p) { + size = len(p) + } + buf := p[0:size] + return sr.reader.Read(buf) +} + +func TestReceiverPreSamplerCancel(t *testing.T) { + assert := assert.New(t) + + var wg sync.WaitGroup + var buf bytes.Buffer + + n := 100 // Payloads need to be big enough, else bug is not triggered + msgp.Encode(&buf, testutil.GetTestTrace(n, n, true)) + + conf := newTestReceiverConfig() + receiver := newTestReceiverFromConfig(conf) + receiver.PreSampler.SetRate(0.000001) // Make sure we sample aggressively + + server := httptest.NewServer(http.HandlerFunc(receiver.httpHandleWithVersion(v04, receiver.handleTraces))) + + defer server.Close() + url := server.URL + "/v0.4/traces" + + // Make sure we use share clients, and they are reused. 
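+ // (MaxIdleConnsPerHost keeps pooled connections alive across the requests issued below)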
+ client := &http.Client{Transport: &http.Transport{ + MaxIdleConnsPerHost: 100, + }} + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + for j := 0; j < 3; j++ { + reader := &chunkedReader{reader: bytes.NewReader(buf.Bytes())} + req, err := http.NewRequest("POST", url, reader) + req.Header.Set("Content-Type", "application/msgpack") + req.Header.Set(sampler.TraceCountHeader, strconv.Itoa(n)) + assert.Nil(err) + + resp, err := client.Do(req) + assert.Nil(err) + assert.NotNil(resp) + if resp != nil { + assert.Equal(http.StatusOK, resp.StatusCode) + } + } + wg.Done() + }() + } + wg.Wait() +} + +func BenchmarkHandleTracesFromOneApp(b *testing.B) { + // prepare the payload + // msgpack payload + var buf bytes.Buffer + msgp.Encode(&buf, testutil.GetTestTrace(1, 1, true)) + + // prepare the receiver + conf := newTestReceiverConfig() + receiver := newTestReceiverFromConfig(conf) + + // response recorder + handler := http.HandlerFunc(receiver.httpHandleWithVersion(v04, receiver.handleTraces)) + + // benchmark + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + b.StopTimer() + // consume the traces channel without doing anything + select { + case <-receiver.Out: + default: + } + + // forge the request + rr := httptest.NewRecorder() + req, _ := http.NewRequest("POST", "/v0.4/traces", bytes.NewReader(buf.Bytes())) + req.Header.Set("Content-Type", "application/msgpack") + + // Add meta data to simulate data comming from multiple applications + for _, v := range headerFields { + req.Header.Set(v, langs[n%len(langs)]) + } + + // trace only this execution + b.StartTimer() + handler.ServeHTTP(rr, req) + } +} + +func BenchmarkHandleTracesFromMultipleApps(b *testing.B) { + // prepare the payload + // msgpack payload + var buf bytes.Buffer + msgp.Encode(&buf, testutil.GetTestTrace(1, 1, true)) + + // prepare the receiver + conf := newTestReceiverConfig() + receiver := newTestReceiverFromConfig(conf) + + // response recorder + handler := http.HandlerFunc(receiver.httpHandleWithVersion(v04, receiver.handleTraces)) + + // benchmark + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + b.StopTimer() + // consume the traces channel without doing anything + select { + case <-receiver.Out: + default: + } + + // forge the request + rr := httptest.NewRecorder() + req, _ := http.NewRequest("POST", "/v0.4/traces", bytes.NewReader(buf.Bytes())) + req.Header.Set("Content-Type", "application/msgpack") + + // Add meta data to simulate data comming from multiple applications + for _, v := range headerFields { + req.Header.Set(v, langs[n%len(langs)]) + } + + // trace only this execution + b.StartTimer() + handler.ServeHTTP(rr, req) + } +} + +func BenchmarkDecoderJSON(b *testing.B) { + assert := assert.New(b) + traces := testutil.GetTestTrace(150, 66, true) + + // json payload + payload, err := json.Marshal(traces) + assert.Nil(err) + + // benchmark + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + b.StopTimer() + reader := bytes.NewReader(payload) + + b.StartTimer() + var spans pb.Traces + decoder := json.NewDecoder(reader) + _ = decoder.Decode(&spans) + } +} + +func BenchmarkDecoderMsgpack(b *testing.B) { + assert := assert.New(b) + + // msgpack payload + var buf bytes.Buffer + err := msgp.Encode(&buf, testutil.GetTestTrace(150, 66, true)) + assert.Nil(err) + + // benchmark + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + b.StopTimer() + reader := bytes.NewReader(buf.Bytes()) + + b.StartTimer() + var traces pb.Traces + _ = msgp.Decode(reader, &traces) + } +} 
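As a point of reference for the handler tests above, here is a minimal, self-contained sketch of what a tracer client does against the v0.4 endpoint: msgpack-encode a `pb.Traces` payload, POST it with the `Datadog-Meta-*` headers the receiver reads, and print the `rate_by_service` response. The receiver address (`localhost:8126`) and the span values are illustrative assumptions, not part of the patch; the tests use `testutil.GetTestTrace` and `httptest` instead of a live server.

```go
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"

	"github.com/DataDog/datadog-agent/pkg/trace/pb"
	"github.com/tinylib/msgp/msgp"
)

func main() {
	// One trace containing a single span; the fields mirror those asserted in the tests.
	traces := pb.Traces{
		{
			&pb.Span{
				TraceID:  42,
				SpanID:   52,
				Service:  "fennel_is_amazing",
				Name:     "something_that_should_be_a_metric",
				Resource: "GET /users",
				Meta:     map[string]string{"http.host": "192.168.0.1"},
				Metrics:  map[string]float64{"http.monitor": 41.99},
			},
		},
	}

	// Encode the payload the same way the tests do.
	var buf bytes.Buffer
	if err := msgp.Encode(&buf, traces); err != nil {
		panic(err)
	}

	// Assumed default receiver address; adjust to the configured host/port.
	req, err := http.NewRequest("POST", "http://localhost:8126/v0.4/traces", &buf)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/msgpack")
	req.Header.Set("Datadog-Meta-Lang", "go")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// v0.4 answers with the recommended sampling rates by service.
	body, _ := ioutil.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(body))
}
```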
diff --git a/pkg/trace/api/limited_reader.go b/pkg/trace/api/limited_reader.go new file mode 100644 index 0000000000000..c647ac323ba24 --- /dev/null +++ b/pkg/trace/api/limited_reader.go @@ -0,0 +1,57 @@ +package api + +import ( + "errors" + "io" +) + +// ErrLimitedReaderLimitReached indicates that the read limit has been +// reached. +var ErrLimitedReaderLimitReached = errors.New("read limit reached") + +// LimitedReader reads from a reader up to a specific limit. When this limit +// has been reached, any subsequent read will return +// ErrLimitedReaderLimitReached. +// The underlying reader has to implement io.ReadCloser so that it can be used +// with http request bodies. +type LimitedReader struct { + r io.ReadCloser + limit int64 + Count int64 +} + +// NewLimitedReader creates a new LimitedReader. +func NewLimitedReader(r io.ReadCloser, limit int64) *LimitedReader { + return &LimitedReader{ + r: r, + limit: limit, + } +} + +// Read reads from the underlying reader. +func (r *LimitedReader) Read(buf []byte) (n int, err error) { + if r.limit <= 0 { + return 0, ErrLimitedReaderLimitReached + } + + if int64(len(buf)) > r.limit { + buf = buf[0:r.limit] + } + n, err = r.r.Read(buf) + + // Some libraries (e.g. msgp) will ignore read data if err is not nil. + // We reset err if something was read, and the next read will return + // io.EOF with no data. + if err == io.EOF && n > 0 { + err = nil + } + + r.limit -= int64(n) + r.Count += int64(n) + return +} + +// Close closes the underlying reader. +func (r *LimitedReader) Close() error { + return r.r.Close() +} diff --git a/pkg/trace/api/limited_reader_test.go b/pkg/trace/api/limited_reader_test.go new file mode 100644 index 0000000000000..c3a2d8299dd84 --- /dev/null +++ b/pkg/trace/api/limited_reader_test.go @@ -0,0 +1,105 @@ +package api + +import ( + "bytes" + "io" + "io/ioutil" + "testing" + + "github.com/stretchr/testify/assert" +) + +// fileMock simulates a file which can return both io.EOF and a byte count +// greater than 0. 
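+// This mirrors the io.Reader contract, where a Read may legitimately return n > 0 together with io.EOF.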
+type fileMock struct { + data []byte +} + +func newFileMock(data []byte) *fileMock { + return &fileMock{data: data} +} + +func (f *fileMock) Read(buf []byte) (n int, err error) { + n = len(f.data) + err = nil + + if n > cap(buf) { + n = cap(buf) + } + + if n == len(f.data) { + err = io.EOF + } + + copy(buf, f.data[:n]) + f.data = f.data[n:] + + return +} + +func (f *fileMock) Close() error { + f.data = nil + return nil +} + +func TestLimitedReader(t *testing.T) { + buf := bytes.NewBufferString("foobar") + r := ioutil.NopCloser(buf) + lr := NewLimitedReader(r, 3) + + tmp := make([]byte, 1) + n, err := lr.Read(tmp) + assert.Nil(t, err) + assert.Equal(t, 1, n) + assert.Equal(t, []byte("f"), tmp) + assert.Equal(t, int64(1), lr.Count) + + tmp = make([]byte, 4) + n, err = lr.Read(tmp) + assert.Nil(t, err) + assert.Equal(t, 2, n) + assert.Equal(t, []byte("oo\x00\x00"), tmp) + assert.Equal(t, int64(3), lr.Count) + + tmp = make([]byte, 1) + n, err = lr.Read(tmp) + assert.Equal(t, ErrLimitedReaderLimitReached, err) + assert.Equal(t, int64(3), lr.Count) +} + +func TestLimitedReaderEOFBuffer(t *testing.T) { + buf := bytes.NewBufferString("foobar") + r := ioutil.NopCloser(buf) + lr := NewLimitedReader(r, 12) + + tmp := make([]byte, 6) + n, err := lr.Read(tmp) + assert.Nil(t, err) + assert.Equal(t, 6, n) + assert.Equal(t, []byte("foobar"), tmp) + assert.Equal(t, int64(6), lr.Count) + + tmp = make([]byte, 6) + n, err = lr.Read(tmp) + assert.Equal(t, io.EOF, err) + assert.Equal(t, 0, n) + assert.Equal(t, int64(6), lr.Count) +} + +func TestLimitedReaderEOFMockFile(t *testing.T) { + file := newFileMock([]byte("foobar")) + lr := NewLimitedReader(file, 12) + + tmp := make([]byte, 6) + n, err := lr.Read(tmp) + assert.Nil(t, err) + assert.Equal(t, 6, n) + assert.Equal(t, []byte("foobar"), tmp) + assert.Equal(t, int64(6), lr.Count) + + tmp = make([]byte, 6) + n, err = lr.Read(tmp) + assert.Equal(t, io.EOF, err) + assert.Equal(t, 0, n) + assert.Equal(t, int64(6), lr.Count) +} diff --git a/pkg/trace/api/listener.go b/pkg/trace/api/listener.go new file mode 100644 index 0000000000000..6f52ef03cb47f --- /dev/null +++ b/pkg/trace/api/listener.go @@ -0,0 +1,80 @@ +package api + +import ( + "errors" + "net" + "sync/atomic" + "time" + + log "github.com/cihub/seelog" +) + +// rateLimitedListener wraps a regular TCPListener with rate limiting. 
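+// Each lease period grants a fixed number of connections; once the lease is exhausted,
+// Accept returns a temporary error until Refresh restores the count.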
+type rateLimitedListener struct { + connLease int32 // How many connections are available for this listener before rate-limiting kicks in + *net.TCPListener +} + +// newRateLimitedListener returns a new wrapped listener, which is non-initialized +func newRateLimitedListener(l net.Listener, conns int) (*rateLimitedListener, error) { + tcpL, ok := l.(*net.TCPListener) + + if !ok { + return nil, errors.New("cannot wrap listener") + } + + sl := &rateLimitedListener{connLease: int32(conns), TCPListener: tcpL} + + return sl, nil +} + +// Refresh periodically refreshes the connection lease, and thus cancels any rate limits in place +func (sl *rateLimitedListener) Refresh(conns int) { + for range time.Tick(30 * time.Second) { + atomic.StoreInt32(&sl.connLease, int32(conns)) + log.Debugf("Refreshed the connection lease: %d conns available", conns) + } +} + +// rateLimitedError indicates a user request being blocked by our rate limit +// It satisfies the net.Error interface +type rateLimitedError struct{} + +// Error returns an error string +func (e *rateLimitedError) Error() string { return "request has been rate-limited" } + +// Temporary tells the HTTP server loop that this error is temporary and recoverable +func (e *rateLimitedError) Temporary() bool { return true } + +// Timeout tells the HTTP server loop that this error is not a timeout +func (e *rateLimitedError) Timeout() bool { return false } + +// Accept reimplements the regular Accept but adds rate limiting. +func (sl *rateLimitedListener) Accept() (net.Conn, error) { + if atomic.LoadInt32(&sl.connLease) <= 0 { + // we've reached our cap for this lease period, reject the request + return nil, &rateLimitedError{} + } + + for { + //Wait up to 1 second for Reads and Writes to the new connection + sl.SetDeadline(time.Now().Add(time.Second)) + + newConn, err := sl.TCPListener.Accept() + + if err != nil { + netErr, ok := err.(net.Error) + + //If this is a timeout, then continue to wait for + //new connections + if ok && netErr.Timeout() && netErr.Temporary() { + continue + } + } + + // decrement available conns + atomic.AddInt32(&sl.connLease, -1) + + return newConn, err + } +} diff --git a/pkg/trace/api/logger.go b/pkg/trace/api/logger.go new file mode 100644 index 0000000000000..d5ff0752c8b40 --- /dev/null +++ b/pkg/trace/api/logger.go @@ -0,0 +1,38 @@ +package api + +import ( + "sync" + + log "github.com/cihub/seelog" +) + +// these could be configurable, but fine with hardcoded for now +var maxPerInterval int64 = 10 + +type errorLogger struct { + errors int64 + sync.Mutex +} + +func (l *errorLogger) Errorf(format string, params ...interface{}) { + l.Lock() + + if l.errors < maxPerInterval { + log.Errorf(format, params...) 
+ } + if l.errors == maxPerInterval { + log.Infof("too many error messages to display, skipping output till next minute") + } + + l.errors++ + l.Unlock() +} + +func (l *errorLogger) Reset() { + l.Lock() + if l.errors > maxPerInterval { + log.Infof("skipped %d error messages", l.errors-maxPerInterval) + } + l.errors = 0 + l.Unlock() +} diff --git a/pkg/trace/api/responses.go b/pkg/trace/api/responses.go new file mode 100644 index 0000000000000..2798c68de4852 --- /dev/null +++ b/pkg/trace/api/responses.go @@ -0,0 +1,74 @@ +package api + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +const ( + receiverErrorKey = "datadog.trace_agent.receiver.error" +) + +// We encaspulate the answers in a container, this is to ease-up transition, +// should we add another fied. +type traceResponse struct { + // All the sampling rates recommended, by service + Rates map[string]float64 `json:"rate_by_service"` +} + +// HTTPFormatError is used for payload format errors +func HTTPFormatError(tags []string, w http.ResponseWriter) { + tags = append(tags, "error:format-error") + metrics.Count(receiverErrorKey, 1, tags, 1) + http.Error(w, "format-error", http.StatusUnsupportedMediaType) +} + +// HTTPDecodingError is used for errors happening in decoding +func HTTPDecodingError(err error, tags []string, w http.ResponseWriter) { + status := http.StatusBadRequest + errtag := "decoding-error" + msg := err.Error() + + if err == ErrLimitedReaderLimitReached { + status = http.StatusRequestEntityTooLarge + errtag := "payload-too-large" + msg = errtag + } + + tags = append(tags, fmt.Sprintf("error:%s", errtag)) + metrics.Count(receiverErrorKey, 1, tags, 1) + + http.Error(w, msg, status) +} + +// HTTPEndpointNotSupported is for payloads getting sent to a wrong endpoint +func HTTPEndpointNotSupported(tags []string, w http.ResponseWriter) { + tags = append(tags, "error:unsupported-endpoint") + metrics.Count(receiverErrorKey, 1, tags, 1) + http.Error(w, "unsupported-endpoint", http.StatusInternalServerError) +} + +// HTTPOK is a dumb response for when things are a OK +func HTTPOK(w http.ResponseWriter) { + w.WriteHeader(http.StatusOK) + io.WriteString(w, "OK\n") +} + +// HTTPRateByService outputs, as a JSON, the recommended sampling rates for all services. +func HTTPRateByService(w http.ResponseWriter, dynConf *sampler.DynamicConfig) { + w.WriteHeader(http.StatusOK) + response := traceResponse{ + Rates: dynConf.RateByService.GetAll(), // this is thread-safe + } + encoder := json.NewEncoder(w) + if err := encoder.Encode(response); err != nil { + tags := []string{"error:response-error"} + metrics.Count(receiverErrorKey, 1, tags, 1) + return + } +} diff --git a/pkg/trace/config/README.md b/pkg/trace/config/README.md new file mode 100644 index 0000000000000..3ae56cb994f39 --- /dev/null +++ b/pkg/trace/config/README.md @@ -0,0 +1,28 @@ +# Agent Configuration + +The trace-agent sources configuration from the following locations: + +1. The Datadog Agent configuration file, provided to the `-config` command line flag (default: `/etc/datadog/datadog.conf`) +2. Environment variables: See full list below + +Environment variables will override settings defined in configuration files. + +## File configuration + +Refer to the [Datadog Agent example configuration](https://github.com/DataDog/dd-agent/blob/master/datadog.conf.example) to see all available options. 
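+
+For illustration, a minimal `datadog.conf` could look like this (the keys shown are the ones referenced in the next section; values are placeholders):
+
+```ini
+[Main]
+api_key: <YOUR_API_KEY>
+hostname: my-host
+apm_enabled: true
+```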
+ + +## Environment variables +We allow overriding a subset of configuration values from the environment. These +can be useful when running the agent in a Docker container or in other situations +where env vars are preferrable to static files + +- `DD_APM_ENABLED` - overrides `[Main] apm_enabled` +- `DD_HOSTNAME` - overrides `[Main] hostname` +- `DD_API_KEY` - overrides `[Main] api_key` +- `DD_DOGSTATSD_PORT` - overrides `[Main] dogstatsd_port` +- `DD_BIND_HOST` - overrides `[Main] bind_host` +- `DD_APM_NON_LOCAL_TRAFFIC` - overrides `[Main] non_local_traffic` +- `DD_LOG_LEVEL` - overrides `[Main] log_level` +- `DD_RECEIVER_PORT` - overrides `[trace.receiver] receiver_port` +- `DD_IGNORE_RESOURCE` - overrides `[trace.ignore] resource` diff --git a/pkg/trace/config/apply.go b/pkg/trace/config/apply.go new file mode 100644 index 0000000000000..2c63ea90cac03 --- /dev/null +++ b/pkg/trace/config/apply.go @@ -0,0 +1,468 @@ +package config + +import ( + "encoding/csv" + "errors" + "fmt" + "net/url" + "regexp" + "strings" + "time" + + "github.com/DataDog/datadog-agent/pkg/config" + "github.com/DataDog/datadog-agent/pkg/trace/osutil" + "github.com/DataDog/datadog-agent/pkg/trace/writer/backoff" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + log "github.com/cihub/seelog" +) + +// apiEndpointPrefix is the URL prefix prepended to the default site value from YamlAgentConfig. +const apiEndpointPrefix = "https://trace.agent." + +// ObfuscationConfig holds the configuration for obfuscating sensitive data +// for various span types. +type ObfuscationConfig struct { + // ES holds the obfuscation configuration for ElasticSearch bodies. + ES JSONObfuscationConfig `mapstructure:"elasticsearch"` + + // Mongo holds the obfuscation configuration for MongoDB queries. + Mongo JSONObfuscationConfig `mapstructure:"mongodb"` + + // HTTP holds the obfuscation settings for HTTP URLs. + HTTP HTTPObfuscationConfig `mapstructure:"http"` + + // RemoveStackTraces specifies whether stack traces should be removed. + // More specifically "error.stack" tag values will be cleared. + RemoveStackTraces bool `mapstructure:"remove_stack_traces"` + + // Redis holds the configuration for obfuscating the "redis.raw_command" tag + // for spans of type "redis". + Redis Enablable `mapstructure:"redis"` + + // Memcached holds the configuration for obfuscating the "memcached.command" tag + // for spans of type "memcached". + Memcached Enablable `mapstructure:"memcached"` +} + +// HTTPObfuscationConfig holds the configuration settings for HTTP obfuscation. +type HTTPObfuscationConfig struct { + // RemoveQueryStrings determines query strings to be removed from HTTP URLs. + RemoveQueryString bool `mapstructure:"remove_query_string"` + + // RemovePathDigits determines digits in path segments to be obfuscated. + RemovePathDigits bool `mapstructure:"remove_paths_with_digits"` +} + +// Enablable can represent any option that has an "enabled" boolean sub-field. +type Enablable struct { + Enabled bool `mapstructure:"enabled"` +} + +// JSONObfuscationConfig holds the obfuscation configuration for sensitive +// data found in JSON objects. +type JSONObfuscationConfig struct { + // Enabled will specify whether obfuscation should be enabled. + Enabled bool `mapstructure:"enabled"` + + // KeepValues will specify a set of keys for which their values will + // not be obfuscated. 
+ KeepValues []string `mapstructure:"keep_values"` +} + +type ReplaceRule struct { + // Name specifies the name of the tag that the replace rule addresses. However, + // some exceptions apply such as: + // • "resource.name" will target the resource + // • "*" will target all tags and the resource + Name string `mapstructure:"name"` + + // Pattern specifies the regexp pattern to be used when replacing. It must compile. + Pattern string `mapstructure:"pattern"` + + // Re holds the compiled Pattern and is only used internally. + Re *regexp.Regexp `mapstructure:"-"` + + // Repl specifies the replacement string to be used when Pattern matches. + Repl string `mapstructure:"repl"` +} + +type traceWriter struct { + MaxSpansPerPayload int `mapstructure:"max_spans_per_payload"` + FlushPeriod float64 `mapstructure:"flush_period_seconds"` + UpdateInfoPeriod int `mapstructure:"update_info_period_seconds"` + QueueablePayloadSender queueablePayloadSender `mapstructure:"queue"` +} + +type serviceWriter struct { + UpdateInfoPeriod int `mapstructure:"update_info_period_seconds"` + FlushPeriod int `mapstructure:"flush_period_seconds"` + QueueablePayloadSender queueablePayloadSender `mapstructure:"queue"` +} + +type statsWriter struct { + MaxEntriesPerPayload int `mapstructure:"max_entries_per_payload"` + UpdateInfoPeriod int `mapstructure:"update_info_period_seconds"` + QueueablePayloadSender queueablePayloadSender `mapstructure:"queue"` +} + +type queueablePayloadSender struct { + MaxAge int `mapstructure:"max_age_seconds"` + MaxQueuedBytes int64 `mapstructure:"max_bytes"` + MaxQueuedPayloads int `mapstructure:"max_payloads"` + BackoffDuration int `mapstructure:"exp_backoff_max_duration_seconds"` + BackoffBase int `mapstructure:"exp_backoff_base_milliseconds"` + BackoffGrowth int `mapstructure:"exp_backoff_growth_base"` +} + +func (c *AgentConfig) applyDatadogConfig() error { + if len(c.Endpoints) == 0 { + c.Endpoints = []*Endpoint{{}} + } + if config.Datadog.IsSet("api_key") { + c.Endpoints[0].APIKey = config.Datadog.GetString("api_key") + } + if config.Datadog.IsSet("hostname") { + c.Hostname = config.Datadog.GetString("hostname") + } + if config.Datadog.IsSet("log_level") { + c.LogLevel = config.Datadog.GetString("log_level") + } + if config.Datadog.IsSet("dogstatsd_port") { + c.StatsdPort = config.Datadog.GetInt("dogstatsd_port") + } + + site := config.Datadog.GetString("site") + if site != "" { + c.Endpoints[0].Host = apiEndpointPrefix + site + } + if host := config.Datadog.GetString("apm_config.apm_dd_url"); host != "" { + c.Endpoints[0].Host = host + if site != "" { + log.Infof("'site' and 'apm_dd_url' are both set, using endpoint: %q", host) + } + } + for url, keys := range config.Datadog.GetStringMapStringSlice("apm_config.additional_endpoints") { + if len(keys) == 0 { + log.Errorf("'additional_endpoints' entries must have at least one API key present") + continue + } + for _, key := range keys { + c.Endpoints = append(c.Endpoints, &Endpoint{Host: url, APIKey: key}) + } + } + + proxyList := config.Datadog.GetStringSlice("proxy.no_proxy") + noProxy := make(map[string]bool, len(proxyList)) + for _, host := range proxyList { + // map of hosts that need to be skipped by proxy + noProxy[host] = true + } + for _, e := range c.Endpoints { + e.NoProxy = noProxy[e.Host] + } + if addr := config.Datadog.GetString("proxy.https"); addr != "" { + url, err := url.Parse(addr) + if err == nil { + c.ProxyURL = url + } else { + log.Errorf("Failed to parse proxy URL from proxy.https configuration: %s", err) + } + } + 
+ if config.Datadog.IsSet("skip_ssl_validation") { + c.SkipSSLValidation = config.Datadog.GetBool("skip_ssl_validation") + } + if config.Datadog.IsSet("apm_config.enabled") { + c.Enabled = config.Datadog.GetBool("apm_config.enabled") + } + if config.Datadog.IsSet("apm_config.log_file") { + c.LogFilePath = config.Datadog.GetString("apm_config.log_file") + } + if config.Datadog.IsSet("apm_config.env") { + c.DefaultEnv = config.Datadog.GetString("apm_config.env") + } + if config.Datadog.IsSet("apm_config.receiver_port") { + c.ReceiverPort = config.Datadog.GetInt("apm_config.receiver_port") + } + if config.Datadog.IsSet("apm_config.connection_limit") { + c.ConnectionLimit = config.Datadog.GetInt("apm_config.connection_limit") + } + if config.Datadog.IsSet("apm_config.extra_sample_rate") { + c.ExtraSampleRate = config.Datadog.GetFloat64("apm_config.extra_sample_rate") + } + if config.Datadog.IsSet("apm_config.max_events_per_second") { + c.MaxEPS = config.Datadog.GetFloat64("apm_config.max_events_per_second") + } + if config.Datadog.IsSet("apm_config.max_traces_per_second") { + c.MaxTPS = config.Datadog.GetFloat64("apm_config.max_traces_per_second") + } + if config.Datadog.IsSet("apm_config.ignore_resources") { + c.Ignore["resource"] = config.Datadog.GetStringSlice("apm_config.ignore_resources") + } + + if config.Datadog.IsSet("apm_config.replace_tags") { + rt := make([]*ReplaceRule, 0) + err := config.Datadog.UnmarshalKey("apm_config.replace_tags", &rt) + if err == nil { + err := compileReplaceRules(rt) + if err != nil { + osutil.Exitf("replace_tags: %s", err) + } + c.ReplaceTags = rt + } + } + + if config.Datadog.IsSet("bind_host") { + host := config.Datadog.GetString("bind_host") + c.StatsdHost = host + c.ReceiverHost = host + } + if config.Datadog.IsSet("apm_config.apm_non_local_traffic") { + if config.Datadog.GetBool("apm_config.apm_non_local_traffic") { + c.ReceiverHost = "0.0.0.0" + } + } + + if config.Datadog.IsSet("apm_config.obfuscation") { + var o ObfuscationConfig + err := config.Datadog.UnmarshalKey("apm_config.obfuscation", &o) + if err == nil { + c.Obfuscation = &o + if c.Obfuscation.RemoveStackTraces { + c.addReplaceRule("error.stack", `(?s).*`, "?") + } + } + } + + // undocumented + if config.Datadog.IsSet("apm_config.max_cpu_percent") { + c.MaxCPU = config.Datadog.GetFloat64("apm_config.max_cpu_percent") / 100 + } + if config.Datadog.IsSet("apm_config.max_memory") { + c.MaxMemory = config.Datadog.GetFloat64("apm_config.max_memory") + } + if config.Datadog.IsSet("apm_config.max_connections") { + c.MaxConnections = config.Datadog.GetInt("apm_config.max_connections") + } + + // undocumented + c.ServiceWriterConfig = readServiceWriterConfigYaml() + c.StatsWriterConfig = readStatsWriterConfigYaml() + c.TraceWriterConfig = readTraceWriterConfigYaml() + + // undocumented deprecated + if config.Datadog.IsSet("apm_config.analyzed_rate_by_service") { + rateByService := make(map[string]float64) + if err := config.Datadog.UnmarshalKey("apm_config.analyzed_rate_by_service", &rateByService); err != nil { + return err + } + c.AnalyzedRateByServiceLegacy = rateByService + if len(rateByService) > 0 { + log.Warn("analyzed_rate_by_service is deprecated, please use analyzed_spans instead") + } + } + // undocumeted + if config.Datadog.IsSet("apm_config.analyzed_spans") { + rateBySpan := make(map[string]float64) + if err := config.Datadog.UnmarshalKey("apm_config.analyzed_spans", &rateBySpan); err != nil { + return err + } + for key, rate := range rateBySpan { + serviceName, operationName, err := 
parseServiceAndOp(key)
+			if err != nil {
+				log.Errorf("Error when parsing names: %v", err)
+				continue
+			}
+
+			if _, ok := c.AnalyzedSpansByService[serviceName]; !ok {
+				c.AnalyzedSpansByService[serviceName] = make(map[string]float64)
+			}
+			c.AnalyzedSpansByService[serviceName][operationName] = rate
+		}
+	}
+
+	// undocumented
+	if config.Datadog.IsSet("apm_config.dd_agent_bin") {
+		c.DDAgentBin = config.Datadog.GetString("apm_config.dd_agent_bin")
+	}
+
+	return c.loadDeprecatedValues()
+}
+
+// loadDeprecatedValues loads a set of deprecated values which are kept for
+// backwards compatibility with Agent 5. These should eventually be removed.
+// TODO(x): remove them gradually or fully in a future release.
+func (c *AgentConfig) loadDeprecatedValues() error {
+	cfg := config.Datadog
+	if cfg.IsSet("apm_config.api_key") {
+		c.Endpoints[0].APIKey = config.Datadog.GetString("apm_config.api_key")
+	}
+	if cfg.IsSet("apm_config.log_level") {
+		c.LogLevel = config.Datadog.GetString("apm_config.log_level")
+	}
+	if v := cfg.GetString("apm_config.extra_aggregators"); len(v) > 0 {
+		aggs, err := splitString(v, ',')
+		if err != nil {
+			return err
+		}
+		c.ExtraAggregators = append(c.ExtraAggregators, aggs...)
+	}
+	if !cfg.GetBool("apm_config.log_throttling") {
+		c.LogThrottlingEnabled = false
+	}
+	if cfg.IsSet("apm_config.bucket_size_seconds") {
+		d := time.Duration(cfg.GetInt("apm_config.bucket_size_seconds"))
+		c.BucketInterval = d * time.Second
+	}
+	if cfg.IsSet("apm_config.receiver_timeout") {
+		c.ReceiverTimeout = cfg.GetInt("apm_config.receiver_timeout")
+	}
+	if cfg.IsSet("apm_config.watchdog_check_delay") {
+		d := time.Duration(cfg.GetInt("apm_config.watchdog_check_delay"))
+		c.WatchdogInterval = d * time.Second
+	}
+	return nil
+}
+
+// addReplaceRule adds the specified replace rule to the agent configuration. If the pattern fails
+// to compile as valid regexp, it exits the application with status code 1.
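+//
+// As an illustrative sketch (not part of the original patch), a compiled replace rule
+// amounts to a plain regexp substitution on the targeted tag value; for example, the
+// {Name: "http.url", Pattern: "(token/)([^/]*)", Repl: "${1}?"} rule used in apply_test.go:
+//
+//	re := regexp.MustCompile(`(token/)([^/]*)`)
+//	fmt.Println(re.ReplaceAllString("/api/token/abc123/info", "${1}?"))
+//	// Output: /api/token/?/info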
+func (c *AgentConfig) addReplaceRule(tag, pattern, repl string) { + re, err := regexp.Compile(pattern) + if err != nil { + osutil.Exitf("error adding replace rule: %s", err) + } + c.ReplaceTags = append(c.ReplaceTags, &ReplaceRule{ + Name: tag, + Pattern: pattern, + Re: re, + Repl: repl, + }) +} + +func readServiceWriterConfigYaml() writerconfig.ServiceWriterConfig { + w := serviceWriter{} + c := writerconfig.DefaultServiceWriterConfig() + + if err := config.Datadog.UnmarshalKey("apm_config.service_writer", &w); err == nil { + if w.FlushPeriod > 0 { + c.FlushPeriod = getDuration(w.FlushPeriod) + } + if w.UpdateInfoPeriod > 0 { + c.UpdateInfoPeriod = getDuration(w.UpdateInfoPeriod) + } + c.SenderConfig = readQueueablePayloadSenderConfigYaml(w.QueueablePayloadSender) + } + return c +} + +func readStatsWriterConfigYaml() writerconfig.StatsWriterConfig { + w := statsWriter{} + c := writerconfig.DefaultStatsWriterConfig() + + if err := config.Datadog.UnmarshalKey("apm_config.stats_writer", &w); err == nil { + if w.MaxEntriesPerPayload > 0 { + c.MaxEntriesPerPayload = w.MaxEntriesPerPayload + } + if w.UpdateInfoPeriod > 0 { + c.UpdateInfoPeriod = getDuration(w.UpdateInfoPeriod) + } + c.SenderConfig = readQueueablePayloadSenderConfigYaml(w.QueueablePayloadSender) + } + return c +} + +func readTraceWriterConfigYaml() writerconfig.TraceWriterConfig { + w := traceWriter{} + c := writerconfig.DefaultTraceWriterConfig() + + if err := config.Datadog.UnmarshalKey("apm_config.trace_writer", &w); err == nil { + if w.MaxSpansPerPayload > 0 { + c.MaxSpansPerPayload = w.MaxSpansPerPayload + } + if w.FlushPeriod > 0 { + c.FlushPeriod = time.Duration(w.FlushPeriod*1000) * time.Millisecond + } + if w.UpdateInfoPeriod > 0 { + c.UpdateInfoPeriod = getDuration(w.UpdateInfoPeriod) + } + c.SenderConfig = readQueueablePayloadSenderConfigYaml(w.QueueablePayloadSender) + } + return c +} + +func readQueueablePayloadSenderConfigYaml(yc queueablePayloadSender) writerconfig.QueuablePayloadSenderConf { + c := writerconfig.DefaultQueuablePayloadSenderConf() + + if yc.MaxAge != 0 { + c.MaxAge = getDuration(yc.MaxAge) + } + + if yc.MaxQueuedBytes != 0 { + c.MaxQueuedBytes = yc.MaxQueuedBytes + } + + if yc.MaxQueuedPayloads != 0 { + c.MaxQueuedPayloads = yc.MaxQueuedPayloads + } + + c.ExponentialBackoff = readExponentialBackoffConfigYaml(yc) + + return c +} + +func readExponentialBackoffConfigYaml(yc queueablePayloadSender) backoff.ExponentialConfig { + c := backoff.DefaultExponentialConfig() + + if yc.BackoffDuration > 0 { + c.MaxDuration = getDuration(yc.BackoffDuration) + } + if yc.BackoffBase > 0 { + c.Base = time.Duration(yc.BackoffBase) * time.Millisecond + } + if yc.BackoffGrowth > 0 { + c.GrowthBase = yc.BackoffGrowth + } + + return c +} + +// compileReplaceRules compiles the regular expressions found in the replace rules. +// If it fails it returns the first error. 
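+//
+// Illustrative sketch, not part of the original patch: rules missing a name or a
+// pattern are rejected before any compilation is attempted, e.g.
+//
+//	err := compileReplaceRules([]*ReplaceRule{{Pattern: "foo", Repl: "bar"}})
+//	// err != nil: all rules must have a "name" property (use "*" to target all)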
+func compileReplaceRules(rules []*ReplaceRule) error {
+	for _, r := range rules {
+		if r.Name == "" {
+			return errors.New(`all rules must have a "name" property (use "*" to target all)`)
+		}
+		if r.Pattern == "" {
+			return errors.New(`all rules must have a "pattern"`)
+		}
+		re, err := regexp.Compile(r.Pattern)
+		if err != nil {
+			return fmt.Errorf("key %q: %s", r.Name, err)
+		}
+		r.Re = re
+	}
+	return nil
+}
+
+// getDuration returns the duration of the provided value in seconds
+func getDuration(seconds int) time.Duration {
+	return time.Duration(seconds) * time.Second
+}
+
+func parseServiceAndOp(name string) (string, string, error) {
+	splits := strings.Split(name, "|")
+	if len(splits) != 2 {
+		return "", "", fmt.Errorf("Bad format for operation name and service name in: %s, it should have format: service_name|operation_name", name)
+	}
+	return splits[0], splits[1], nil
+}
+
+func splitString(s string, sep rune) ([]string, error) {
+	r := csv.NewReader(strings.NewReader(s))
+	r.TrimLeadingSpace = true
+	r.LazyQuotes = true
+	r.Comma = sep
+
+	return r.Read()
+}
diff --git a/pkg/trace/config/apply_test.go b/pkg/trace/config/apply_test.go
new file mode 100644
index 0000000000000..90a5aa1e923ed
--- /dev/null
+++ b/pkg/trace/config/apply_test.go
@@ -0,0 +1,24 @@
+package config
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestParseReplaceRules tests the compileReplaceRules helper function.
+func TestParseReplaceRules(t *testing.T) {
+	assert := assert.New(t)
+	rules := []*ReplaceRule{
+		{Name: "http.url", Pattern: "(token/)([^/]*)", Repl: "${1}?"},
+		{Name: "http.url", Pattern: "guid", Repl: "[REDACTED]"},
+		{Name: "custom.tag", Pattern: "(/foo/bar/).*", Repl: "${1}extra"},
+	}
+	err := compileReplaceRules(rules)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, r := range rules {
+		assert.Equal(r.Pattern, r.Re.String())
+	}
+}
diff --git a/pkg/trace/config/config.go b/pkg/trace/config/config.go
new file mode 100644
index 0000000000000..ff2bd8c3ccd7f
--- /dev/null
+++ b/pkg/trace/config/config.go
@@ -0,0 +1,250 @@
+package config
+
+import (
+	"bytes"
+	"errors"
+	"net/url"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/DataDog/datadog-agent/pkg/config"
+	"github.com/DataDog/datadog-agent/pkg/config/legacy"
+	"github.com/DataDog/datadog-agent/pkg/trace/flags"
+	"github.com/DataDog/datadog-agent/pkg/trace/osutil"
+	writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config"
+	log "github.com/cihub/seelog"
+)
+
+var (
+	// ErrMissingAPIKey is returned when the config could not be validated due to missing API key.
+	ErrMissingAPIKey = errors.New("you must specify an API Key, either via a configuration file or the DD_API_KEY env var")
+
+	// ErrMissingHostname is returned when the config could not be validated due to missing hostname.
+	ErrMissingHostname = errors.New("failed to automatically set the hostname, you must specify it via configuration or the DD_HOSTNAME env var")
+)
+
+// Endpoint specifies an endpoint that the trace agent will write data (traces, stats & services) to.
+type Endpoint struct {
+	APIKey string `json:"-"` // never marshal this
+	Host   string
+
+	// NoProxy will be set to true when the proxy setting for the trace API endpoint
+	// needs to be ignored (e.g. it is part of the "no_proxy" list in the yaml settings).
+	NoProxy bool
+}
+
+// AgentConfig handles the interpretation of the configuration (with default
+// behaviors) in one place.
+// It is also a simple structure to share across all the Agent components,
+// with 100% safe and reliable values.
+// It is exposed with expvar, so make sure to exclude any sensitive field
+// from JSON encoding. Use New() to create an instance.
+type AgentConfig struct {
+	Enabled bool
+
+	// Global
+	Hostname   string
+	DefaultEnv string // the traces will default to this environment
+	ConfigPath string // the source of this config, if any
+
+	// Endpoints specifies the set of hosts and API keys where traces and stats
+	// will be uploaded to. The first endpoint is the main configuration endpoint;
+	// any following ones are read from the 'additional_endpoints' parts of the
+	// configuration file, if present.
+	Endpoints []*Endpoint
+
+	// Concentrator
+	BucketInterval   time.Duration // the size of our pre-aggregation per bucket
+	ExtraAggregators []string
+
+	// Sampler configuration
+	ExtraSampleRate float64
+	MaxTPS          float64
+	MaxEPS          float64
+
+	// Receiver
+	ReceiverHost    string
+	ReceiverPort    int
+	ConnectionLimit int // for rate-limiting, how many unique connections to allow in a lease period (30s)
+	ReceiverTimeout int
+
+	// Writers
+	ServiceWriterConfig writerconfig.ServiceWriterConfig
+	StatsWriterConfig   writerconfig.StatsWriterConfig
+	TraceWriterConfig   writerconfig.TraceWriterConfig
+
+	// internal telemetry
+	StatsdHost string
+	StatsdPort int
+
+	// logging
+	LogLevel             string
+	LogFilePath          string
+	LogThrottlingEnabled bool
+
+	// watchdog
+	MaxMemory        float64       // MaxMemory is the threshold (bytes allocated) above which program panics and exits, to be restarted
+	MaxCPU           float64       // MaxCPU is the max UserAvg CPU the program should consume
+	MaxConnections   int           // (deprecated) MaxConnections is the threshold (opened TCP connections) above which program panics and exits, to be restarted
+	WatchdogInterval time.Duration // WatchdogInterval is the delay between 2 watchdog checks
+
+	// http/s proxying
+	ProxyURL          *url.URL
+	SkipSSLValidation bool
+
+	// filtering
+	Ignore map[string][]string
+
+	// ReplaceTags is used to filter out sensitive information from tag values.
+	// It maps tag keys to a set of replacements. Only supported in A6.
+	ReplaceTags []*ReplaceRule
+
+	// transaction analytics
+	AnalyzedRateByServiceLegacy map[string]float64
+	AnalyzedSpansByService      map[string]map[string]float64
+
+	// infrastructure agent binary
+	DDAgentBin string // DDAgentBin will be "" for Agent5 scenarios
+
+	// Obfuscation holds the sensitive data obfuscator's configuration.
+	Obfuscation *ObfuscationConfig
+}
+
+// New returns a configuration with the default values.
+func New() *AgentConfig { + return &AgentConfig{ + Enabled: true, + DefaultEnv: "none", + Endpoints: []*Endpoint{{Host: "https://trace.agent.datadoghq.com"}}, + + BucketInterval: time.Duration(10) * time.Second, + ExtraAggregators: []string{"http.status_code"}, + + ExtraSampleRate: 1.0, + MaxTPS: 10, + MaxEPS: 200, + + ReceiverHost: "localhost", + ReceiverPort: 8126, + ConnectionLimit: 2000, + + ServiceWriterConfig: writerconfig.DefaultServiceWriterConfig(), + StatsWriterConfig: writerconfig.DefaultStatsWriterConfig(), + TraceWriterConfig: writerconfig.DefaultTraceWriterConfig(), + + StatsdHost: "localhost", + StatsdPort: 8125, + + LogLevel: "INFO", + LogFilePath: DefaultLogFilePath, + LogThrottlingEnabled: true, + + MaxMemory: 5e8, // 500 Mb, should rarely go above 50 Mb + MaxCPU: 0.5, // 50%, well behaving agents keep below 5% + MaxConnections: 200, // in practice, rarely goes over 20 + WatchdogInterval: time.Minute, + + Ignore: make(map[string][]string), + AnalyzedRateByServiceLegacy: make(map[string]float64), + AnalyzedSpansByService: make(map[string]map[string]float64), + } +} + +// Validate validates if the current configuration is good for the agent to start with. +func (c *AgentConfig) validate() error { + if len(c.Endpoints) == 0 || c.Endpoints[0].APIKey == "" { + return ErrMissingAPIKey + } + if c.Hostname == "" { + if err := c.acquireHostname(); err != nil { + return err + } + } + return nil +} + +// fallbackHostnameFunc specifies the function to use for obtaining the hostname +// when it can not be obtained by any other means. It is replaced in tests. +var fallbackHostnameFunc = os.Hostname + +// acquireHostname attempts to acquire a hostname for this configuration. It +// tries to shell out to the infrastructure agent for this, if DD_AGENT_BIN is +// set, otherwise falling back to os.Hostname. +func (c *AgentConfig) acquireHostname() error { + var cmd *exec.Cmd + if c.DDAgentBin != "" { + // Agent 6 + cmd = exec.Command(c.DDAgentBin, "hostname") + cmd.Env = []string{} + } else { + // Most likely Agent 5. Try and obtain the hostname using the Agent's + // Python environment, which will cover several additional installation + // scenarios such as GCE, EC2, Kube, Docker, etc. In these scenarios + // Go's os.Hostname will not be able to obtain the correct host. Do not + // remove! + cmd = exec.Command(defaultDDAgentPy, "-c", "from utils.hostname import get_hostname; print get_hostname()") + cmd.Env = []string{defaultDDAgentPyEnv} + } + var out bytes.Buffer + cmd.Stdout = &out + cmd.Env = append(os.Environ(), cmd.Env...) // needed for Windows + err := cmd.Run() + c.Hostname = strings.TrimSpace(out.String()) + if err != nil || c.Hostname == "" { + c.Hostname, err = fallbackHostnameFunc() + } + if c.Hostname == "" { + err = ErrMissingHostname + } + return err +} + +// Load returns a new configuration based on the given path. The path must not necessarily exist +// and a valid configuration can be returned based on defaults and environment variables. If a +// valid configuration can not be obtained, an error is returned. 
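+//
+// A minimal usage sketch (illustrative, not part of the original patch; the path below
+// is only an example):
+//
+//	cfg, err := Load("/etc/datadog-agent/datadog.yaml")
+//	if err != nil {
+//		return err
+//	}
+//	log.Infof("receiver will listen on %s:%d", cfg.ReceiverHost, cfg.ReceiverPort)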
+func Load(path string) (*AgentConfig, error) {
+	cfg, err := prepareConfig(path)
+	if err != nil {
+		if !os.IsNotExist(err) {
+			return nil, err
+		}
+	} else {
+		log.Infof("Loaded configuration: %s", cfg.ConfigPath)
+	}
+	applyEnv()
+	if err := cfg.applyDatadogConfig(); err != nil {
+		return cfg, err
+	}
+	return cfg, cfg.validate()
+}
+
+func prepareConfig(path string) (*AgentConfig, error) {
+	cfgPath := path
+	if cfgPath == flags.DefaultConfigPath && !osutil.Exists(cfgPath) && osutil.Exists(agent5Config) {
+		// attempting to load a nonexistent default path, but found an existing Agent 5
+		// legacy config - try using it
+		log.Warnf("Attempting to use Agent 5 configuration: %s", agent5Config)
+		cfgPath = agent5Config
+	}
+	cfg := New()
+	switch filepath.Ext(cfgPath) {
+	case ".ini", ".conf":
+		ac, err := legacy.GetAgentConfig(cfgPath)
+		if err != nil {
+			return cfg, err
+		}
+		if err := legacy.FromAgentConfig(ac); err != nil {
+			return cfg, err
+		}
+	case ".yaml":
+		config.Datadog.SetConfigFile(cfgPath)
+		if err := config.Load(); err != nil {
+			return cfg, err
+		}
+		cfg.DDAgentBin = defaultDDAgentBin
+	default:
+		return cfg, errors.New("unrecognised file extension (need .yaml, .ini or .conf)")
+	}
+	cfg.ConfigPath = cfgPath
+	return cfg, nil
+}
diff --git a/pkg/trace/config/config_nix.go b/pkg/trace/config/config_nix.go
new file mode 100644
index 0000000000000..058ba33aacead
--- /dev/null
+++ b/pkg/trace/config/config_nix.go
@@ -0,0 +1,20 @@
+// +build !windows
+
+package config
+
+const (
+	// DefaultLogFilePath is where the agent will write logs if not overridden in the conf
+	DefaultLogFilePath = "/var/log/datadog/trace-agent.log"
+
+	// Agent 5 Python Environment - exposes access to Python utilities
+	// such as obtaining the hostname from GCE, EC2, Kube, etc.
+	defaultDDAgentPy    = "/opt/datadog-agent/embedded/bin/python"
+	defaultDDAgentPyEnv = "PYTHONPATH=/opt/datadog-agent/agent"
+
+	// Agent 6
+	defaultDDAgentBin = "/opt/datadog-agent/bin/agent/agent"
+)
+
+// agent5Config points to the default agent 5 configuration path. It is used
+// as a fallback when no configuration is set and the new default is missing.
+const agent5Config = "/etc/dd-agent/datadog.conf" diff --git a/pkg/trace/config/config_test.go b/pkg/trace/config/config_test.go new file mode 100644 index 0000000000000..c8f30b1778dd7 --- /dev/null +++ b/pkg/trace/config/config_test.go @@ -0,0 +1,408 @@ +package config + +import ( + "os" + "regexp" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/config" + "github.com/DataDog/datadog-agent/pkg/trace/writer/backoff" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + "github.com/stretchr/testify/assert" +) + +func cleanConfig() func() { + oldConfig := config.Datadog + config.Datadog = config.NewConfig("datadog", "DD", strings.NewReplacer(".", "_")) + return func() { config.Datadog = oldConfig } +} + +func TestConfigHostname(t *testing.T) { + t.Run("nothing", func(t *testing.T) { + defer cleanConfig()() + assert := assert.New(t) + fallbackHostnameFunc = func() (string, error) { + return "", nil + } + defer func() { + fallbackHostnameFunc = os.Hostname + }() + _, err := Load("./testdata/multi_api_keys.ini") + assert.Equal(ErrMissingHostname, err) + }) + + t.Run("fallback", func(t *testing.T) { + defer cleanConfig()() + host, err := os.Hostname() + if err != nil || host == "" { + // can't say + t.Skip() + } + assert := assert.New(t) + cfg, err := Load("./testdata/multi_api_keys.ini") + assert.NoError(err) + assert.Equal(host, cfg.Hostname) + }) + + t.Run("file", func(t *testing.T) { + defer cleanConfig()() + assert := assert.New(t) + cfg, err := Load("./testdata/full.yaml") + assert.NoError(err) + assert.Equal("mymachine", cfg.Hostname) + }) + + t.Run("env", func(t *testing.T) { + defer cleanConfig()() + // hostname from env + assert := assert.New(t) + err := os.Setenv("DD_HOSTNAME", "onlyenv") + defer os.Unsetenv("DD_HOSTNAME") + assert.NoError(err) + cfg, err := Load("./testdata/multi_api_keys.ini") + assert.NoError(err) + assert.Equal("onlyenv", cfg.Hostname) + }) + + t.Run("file+env", func(t *testing.T) { + defer cleanConfig()() + // hostname from file, overwritten from env + assert := assert.New(t) + err := os.Setenv("DD_HOSTNAME", "envoverride") + defer os.Unsetenv("DD_HOSTNAME") + assert.NoError(err) + cfg, err := Load("./testdata/full.yaml") + assert.NoError(err) + assert.Equal("envoverride", cfg.Hostname) + }) +} + +func TestSite(t *testing.T) { + for name, tt := range map[string]struct { + file string + url string + }{ + "default": {"./testdata/site_default.yaml", "https://trace.agent.datadoghq.com"}, + "eu": {"./testdata/site_eu.yaml", "https://trace.agent.datadoghq.eu"}, + "url": {"./testdata/site_url.yaml", "some.other.datadoghq.eu"}, + "override": {"./testdata/site_override.yaml", "some.other.datadoghq.eu"}, + } { + t.Run(name, func(t *testing.T) { + defer cleanConfig()() + cfg, err := Load(tt.file) + assert.NoError(t, err) + assert.Equal(t, tt.url, cfg.Endpoints[0].Host) + }) + } +} + +func TestDefaultConfig(t *testing.T) { + assert := assert.New(t) + c := New() + + // assert that some sane defaults are set + assert.Equal("localhost", c.ReceiverHost) + assert.Equal(8126, c.ReceiverPort) + + assert.Equal("localhost", c.StatsdHost) + assert.Equal(8125, c.StatsdPort) + + assert.Equal("INFO", c.LogLevel) + assert.Equal(true, c.Enabled) + +} + +func TestOnlyDDAgentConfig(t *testing.T) { + defer cleanConfig()() + assert := assert.New(t) + + c, err := prepareConfig("./testdata/no_apm_config.ini") + assert.NoError(err) + assert.NoError(c.applyDatadogConfig()) + + assert.Equal("thing", c.Hostname) + assert.Equal("apikey_12", 
c.Endpoints[0].APIKey) + assert.Equal("0.0.0.0", c.ReceiverHost) + assert.Equal(28125, c.StatsdPort) + assert.Equal("DEBUG", c.LogLevel) +} + +func TestDDAgentMultiAPIKeys(t *testing.T) { + defer cleanConfig()() + // old feature Datadog Agent feature, got dropped since + // TODO: at some point, expire this case + assert := assert.New(t) + + c, err := prepareConfig("./testdata/multi_api_keys.ini") + assert.NoError(err) + assert.NoError(c.applyDatadogConfig()) + + assert.Equal("foo", c.Endpoints[0].APIKey) +} + +func TestFullIniConfig(t *testing.T) { + defer cleanConfig()() + assert := assert.New(t) + + c, err := prepareConfig("./testdata/full.ini") + assert.NoError(err) + assert.NoError(c.applyDatadogConfig()) + + assert.Equal("api_key_test", c.Endpoints[0].APIKey) + assert.Equal("mymachine", c.Hostname) + assert.Equal("https://user:password@proxy_for_https:1234", c.ProxyURL.String()) + assert.Equal("https://datadog.unittests", c.Endpoints[0].Host) + assert.Equal(false, c.Enabled) + assert.Equal("test", c.DefaultEnv) + assert.Equal(18126, c.ReceiverPort) + assert.Equal(18125, c.StatsdPort) + assert.Equal(0.5, c.ExtraSampleRate) + assert.Equal(5.0, c.MaxTPS) + assert.Equal(50.0, c.MaxEPS) + assert.Equal("0.0.0.0", c.ReceiverHost) + assert.Equal("host.ip", c.StatsdHost) + assert.Equal("/path/to/file", c.LogFilePath) + assert.Equal("debug", c.LogLevel) + assert.False(c.LogThrottlingEnabled) + assert.True(c.SkipSSLValidation) + + assert.Equal(map[string]float64{ + "service1": 1.1, + "service2": 1.2, + }, c.AnalyzedRateByServiceLegacy) + + assert.Equal(map[string]map[string]float64{ + "service3": map[string]float64{ + "op3": 1.3, + }, + "service4": map[string]float64{ + "op4": 1.4, + "op5": 1.5, + }, + }, c.AnalyzedSpansByService) + + assert.Equal(5*time.Second, c.BucketInterval) + assert.Equal([]string{"http.status_code", "a", "b", "c"}, c.ExtraAggregators) + assert.Equal(2000, c.ConnectionLimit) + assert.Equal(4, c.ReceiverTimeout) + assert.Equal(1234.5, c.MaxMemory) + assert.Equal(.85, c.MaxCPU) + assert.Equal(40, c.MaxConnections) + assert.Equal(5*time.Second, c.WatchdogInterval) + assert.EqualValues([]string{"/health", "/500"}, c.Ignore["resource"]) + + assert.Equal(writerconfig.ServiceWriterConfig{ + FlushPeriod: time.Second, + UpdateInfoPeriod: time.Second, + SenderConfig: writerconfig.QueuablePayloadSenderConf{ + MaxAge: time.Second, + MaxQueuedBytes: 456, + MaxQueuedPayloads: 4, + ExponentialBackoff: backoff.ExponentialConfig{ + MaxDuration: 4 * time.Second, + GrowthBase: 2, + Base: 1000000, + }, + }, + }, c.ServiceWriterConfig) + + assert.Equal(writerconfig.StatsWriterConfig{ + MaxEntriesPerPayload: 10, + UpdateInfoPeriod: 2 * time.Second, + SenderConfig: writerconfig.QueuablePayloadSenderConf{ + MaxAge: time.Second, + MaxQueuedBytes: 456, + MaxQueuedPayloads: 4, + ExponentialBackoff: backoff.ExponentialConfig{ + MaxDuration: 4 * time.Second, + GrowthBase: 2, + Base: 1000000, + }, + }, + }, c.StatsWriterConfig) + + assert.Equal(writerconfig.TraceWriterConfig{ + MaxSpansPerPayload: 100, + FlushPeriod: 3 * time.Second, + UpdateInfoPeriod: 2 * time.Second, + SenderConfig: writerconfig.QueuablePayloadSenderConf{ + MaxAge: time.Second, + MaxQueuedBytes: 456, + MaxQueuedPayloads: 4, + ExponentialBackoff: backoff.ExponentialConfig{ + MaxDuration: 4 * time.Second, + GrowthBase: 2, + Base: 1000000, + }, + }, + }, c.TraceWriterConfig) +} + +func TestFullYamlConfig(t *testing.T) { + defer cleanConfig()() + origcfg := config.Datadog + config.Datadog = config.NewConfig("datadog", "DD", 
strings.NewReplacer(".", "_")) + defer func() { + config.Datadog = origcfg + }() + + assert := assert.New(t) + + c, err := prepareConfig("./testdata/full.yaml") + assert.NoError(err) + assert.NoError(c.applyDatadogConfig()) + + assert.Equal("mymachine", c.Hostname) + assert.Equal("https://user:password@proxy_for_https:1234", c.ProxyURL.String()) + assert.True(c.SkipSSLValidation) + assert.Equal("debug", c.LogLevel) + assert.Equal(18125, c.StatsdPort) + assert.False(c.Enabled) + assert.Equal("abc", c.LogFilePath) + assert.Equal("test", c.DefaultEnv) + assert.Equal(123, c.ConnectionLimit) + assert.Equal(18126, c.ReceiverPort) + assert.Equal(0.5, c.ExtraSampleRate) + assert.Equal(5.0, c.MaxTPS) + assert.Equal(50.0, c.MaxEPS) + assert.Equal(0.005, c.MaxCPU) + assert.EqualValues(123.4, c.MaxMemory) + assert.Equal(12, c.MaxConnections) + assert.Equal("0.0.0.0", c.ReceiverHost) + + noProxy := true + if _, ok := os.LookupEnv("NO_PROXY"); ok { + // Happens in CircleCI: if the enviornment variable is set, + // it will overwrite our loaded configuration and will cause + // this test to fail. + noProxy = false + } + + assert.ElementsMatch([]*Endpoint{ + {Host: "https://datadog.unittests", APIKey: "api_key_test"}, + {Host: "https://my1.endpoint.com", APIKey: "apikey1"}, + {Host: "https://my1.endpoint.com", APIKey: "apikey2"}, + {Host: "https://my2.endpoint.eu", APIKey: "apikey3", NoProxy: noProxy}, + }, c.Endpoints) + + assert.ElementsMatch([]*ReplaceRule{ + { + Name: "http.method", + Pattern: "\\?.*$", + Repl: "GET", + Re: regexp.MustCompile("\\?.*$"), + }, + { + Name: "http.url", + Pattern: "\\?.*$", + Repl: "!", + Re: regexp.MustCompile("\\?.*$"), + }, + { + Name: "error.stack", + Pattern: "(?s).*", + Repl: "?", + Re: regexp.MustCompile("(?s).*"), + }, + }, c.ReplaceTags) + + assert.EqualValues([]string{"/health", "/500"}, c.Ignore["resource"]) + + o := c.Obfuscation + assert.NotNil(o) + assert.True(o.ES.Enabled) + assert.EqualValues([]string{"user_id", "category_id"}, o.ES.KeepValues) + assert.True(o.Mongo.Enabled) + assert.EqualValues([]string{"uid", "cat_id"}, o.Mongo.KeepValues) + assert.True(o.HTTP.RemoveQueryString) + assert.True(o.HTTP.RemovePathDigits) + assert.True(o.RemoveStackTraces) + assert.True(c.Obfuscation.Redis.Enabled) + assert.True(c.Obfuscation.Memcached.Enabled) +} + +func TestUndocumentedYamlConfig(t *testing.T) { + defer cleanConfig()() + origcfg := config.Datadog + config.Datadog = config.NewConfig("datadog", "DD", strings.NewReplacer(".", "_")) + defer func() { + config.Datadog = origcfg + }() + assert := assert.New(t) + + c, err := prepareConfig("./testdata/undocumented.yaml") + assert.NoError(err) + assert.NoError(c.applyDatadogConfig()) + + assert.Equal("/path/to/bin", c.DDAgentBin) + assert.Equal("thing", c.Hostname) + assert.Equal("apikey_12", c.Endpoints[0].APIKey) + assert.Equal(0.33, c.ExtraSampleRate) + assert.Equal(100.0, c.MaxTPS) + assert.Equal(1000.0, c.MaxEPS) + assert.Equal(25, c.ReceiverPort) + // watchdog + assert.Equal(0.07, c.MaxCPU) + assert.Equal(30e6, c.MaxMemory) + assert.Equal(50, c.MaxConnections) + + // Assert Trace Writer + assert.Equal(11, c.TraceWriterConfig.MaxSpansPerPayload) + assert.Equal(22*time.Second, c.TraceWriterConfig.FlushPeriod) + assert.Equal(33*time.Second, c.TraceWriterConfig.UpdateInfoPeriod) + assert.Equal(15*time.Second, c.TraceWriterConfig.SenderConfig.MaxAge) + assert.Equal(int64(2048), c.TraceWriterConfig.SenderConfig.MaxQueuedBytes) + assert.Equal(100, c.TraceWriterConfig.SenderConfig.MaxQueuedPayloads) + // Assert Service 
Writer + assert.Equal(55*time.Second, c.ServiceWriterConfig.FlushPeriod) + assert.Equal(44*time.Second, c.ServiceWriterConfig.UpdateInfoPeriod) + assert.Equal(15*time.Second, c.ServiceWriterConfig.SenderConfig.MaxAge) + assert.Equal(int64(2048), c.ServiceWriterConfig.SenderConfig.MaxQueuedBytes) + assert.Equal(100, c.ServiceWriterConfig.SenderConfig.MaxQueuedPayloads) + // Assert Stats Writer + assert.Equal(66*time.Second, c.StatsWriterConfig.UpdateInfoPeriod) + assert.Equal(15*time.Second, c.StatsWriterConfig.SenderConfig.MaxAge) + assert.Equal(int64(2048), c.StatsWriterConfig.SenderConfig.MaxQueuedBytes) + assert.Equal(100, c.StatsWriterConfig.SenderConfig.MaxQueuedPayloads) + // analysis legacy + assert.Equal(1.0, c.AnalyzedRateByServiceLegacy["db"]) + assert.Equal(0.9, c.AnalyzedRateByServiceLegacy["web"]) + assert.Equal(0.5, c.AnalyzedRateByServiceLegacy["index"]) + // analysis + assert.Len(c.AnalyzedSpansByService, 2) + assert.Len(c.AnalyzedSpansByService["web"], 2) + assert.Len(c.AnalyzedSpansByService["db"], 1) + assert.Equal(0.8, c.AnalyzedSpansByService["web"]["request"]) + assert.Equal(0.9, c.AnalyzedSpansByService["web"]["django.request"]) + assert.Equal(0.05, c.AnalyzedSpansByService["db"]["intake"]) +} + +func TestAcquireHostname(t *testing.T) { + c := New() + err := c.acquireHostname() + assert.Nil(t, err) + host, _ := os.Hostname() + assert.Equal(t, host, c.Hostname) +} + +func TestUndocumentedIni(t *testing.T) { + defer cleanConfig()() + assert := assert.New(t) + + c, err := prepareConfig("./testdata/undocumented.ini") + assert.NoError(err) + assert.NoError(c.applyDatadogConfig()) + + // analysis legacy + assert.Equal(0.8, c.AnalyzedRateByServiceLegacy["web"]) + assert.Equal(0.05, c.AnalyzedRateByServiceLegacy["intake"]) + // analysis + assert.Len(c.AnalyzedSpansByService, 2) + assert.Len(c.AnalyzedSpansByService["web"], 2) + assert.Len(c.AnalyzedSpansByService["db"], 1) + assert.Equal(0.8, c.AnalyzedSpansByService["web"]["http.request"]) + assert.Equal(0.9, c.AnalyzedSpansByService["web"]["django.request"]) + assert.Equal(0.05, c.AnalyzedSpansByService["db"]["intake"]) +} diff --git a/pkg/trace/config/config_windows.go b/pkg/trace/config/config_windows.go new file mode 100644 index 0000000000000..556956b973d7a --- /dev/null +++ b/pkg/trace/config/config_windows.go @@ -0,0 +1,37 @@ +package config + +import ( + "path/filepath" + + "github.com/DataDog/datadog-agent/pkg/util/executable" + "github.com/DataDog/datadog-agent/pkg/util/winutil" +) + +var ( + // DefaultLogFilePath is where the agent will write logs if not overriden in the conf + DefaultLogFilePath = "c:\\programdata\\datadog\\logs\\trace-agent.log" + + // Agent 5 Python Environment - exposes access to Python utilities + // such as obtaining the hostname from GCE, EC2, Kube, etc. + defaultDDAgentPy = "c:\\Program Files\\Datadog\\Datadog Agent\\embedded\\python.exe" + defaultDDAgentPyEnv = "PYTHONPATH=c:\\Program Files\\Datadog\\Datadog Agent\\agent" + + // Agent 6 + defaultDDAgentBin = "c:\\Program Files\\Datadog\\Datadog Agent\\embedded\\agent.exe" +) + +// agent5Config points to the default agent 5 configuration path. It is used +// as a fallback when no configuration is set and the new default is missing. 
+const agent5Config = "c:\\programdata\\datadog\\datadog.conf"
+
+func init() {
+	pd, err := winutil.GetProgramDataDir()
+	if err == nil {
+		DefaultLogFilePath = filepath.Join(pd, "Datadog", "logs", "trace-agent.log")
+	}
+	_here, err := executable.Folder()
+	if err == nil {
+		defaultDDAgentBin = filepath.Join(_here, "..", "..", "embedded", "agent.exe")
+	}
+}
diff --git a/pkg/trace/config/env.go b/pkg/trace/config/env.go
new file mode 100644
index 0000000000000..6b30943cb1b3c
--- /dev/null
+++ b/pkg/trace/config/env.go
@@ -0,0 +1,96 @@
+package config
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"github.com/DataDog/datadog-agent/pkg/config"
+	log "github.com/cihub/seelog"
+)
+
+func applyEnv() {
+	// Warning: do not use BindEnv to bind config variables. They will be overridden
+	// when using the legacy config loader.
+	for _, override := range []struct{ env, key string }{
+		// Core agent:
+		{"DD_SITE", "site"},
+		{"DD_API_KEY", "api_key"},
+		{"DD_HOSTNAME", "hostname"},
+		{"DD_BIND_HOST", "bind_host"},
+		{"DD_DOGSTATSD_PORT", "dogstatsd_port"},
+		{"DD_LOG_LEVEL", "log_level"},
+		{"HTTPS_PROXY", "proxy.https"}, // deprecated
+		{"DD_PROXY_HTTPS", "proxy.https"},
+
+		// APM specific:
+		{"DD_CONNECTION_LIMIT", "apm_config.connection_limit"}, // deprecated
+		{"DD_APM_CONNECTION_LIMIT", "apm_config.connection_limit"},
+		{"DD_APM_ENABLED", "apm_config.enabled"},
+		{"DD_APM_ENV", "apm_config.env"},
+		{"DD_APM_NON_LOCAL_TRAFFIC", "apm_config.apm_non_local_traffic"},
+		{"DD_APM_DD_URL", "apm_config.apm_dd_url"},
+		{"DD_RECEIVER_PORT", "apm_config.receiver_port"}, // deprecated
+		{"DD_APM_RECEIVER_PORT", "apm_config.receiver_port"},
+		{"DD_MAX_EPS", "apm_config.max_events_per_second"}, // deprecated
+		{"DD_APM_MAX_EPS", "apm_config.max_events_per_second"},
+		{"DD_MAX_TPS", "apm_config.max_traces_per_second"}, // deprecated
+		{"DD_APM_MAX_TPS", "apm_config.max_traces_per_second"},
+		{"DD_APM_MAX_MEMORY", "apm_config.max_memory"},
+	} {
+		if v := os.Getenv(override.env); v != "" {
+			config.Datadog.Set(override.key, v)
+		}
+	}
+	for _, envKey := range []string{
+		"DD_IGNORE_RESOURCE", // deprecated
+		"DD_APM_IGNORE_RESOURCES",
+	} {
+		if v := os.Getenv(envKey); v != "" {
+			if r, err := splitString(v, ','); err != nil {
+				log.Warnf("%q value not loaded: %v", envKey, err)
+			} else {
+				config.Datadog.Set("apm_config.ignore_resources", r)
+			}
+		}
+	}
+	if v := os.Getenv("DD_APM_ANALYZED_SPANS"); v != "" {
+		analyzedSpans, err := parseAnalyzedSpans(v)
+		if err == nil {
+			config.Datadog.Set("apm_config.analyzed_spans", analyzedSpans)
+		} else {
+			log.Errorf("Bad format for %s, it should be of the form \"service_name|operation_name=rate,other_service|other_operation=rate\", error: %v", "DD_APM_ANALYZED_SPANS", err)
+		}
+	}
+}
+
+func parseNameAndRate(token string) (string, float64, error) {
+	parts := strings.Split(token, "=")
+	if len(parts) != 2 {
+		return "", 0, fmt.Errorf("bad format")
+	}
+	rate, err := strconv.ParseFloat(parts[1], 64)
+	if err != nil {
+		return "", 0, fmt.Errorf("unable to parse rate")
+	}
+	return parts[0], rate, nil
+}
+
+// parseAnalyzedSpans parses the env string to extract a map of spans to be analyzed by service and operation.
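+// It returns a flat map keyed by "service|operation". As an illustrative sketch, not
+// part of the original patch:
+//
+//	rates, _ := parseAnalyzedSpans("web|http.request=1,db|sql.query=0.5")
+//	// rates == map[string]float64{"web|http.request": 1, "db|sql.query": 0.5}
+//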
+// the format is: service_name|operation_name=rate,other_service|other_operation=rate +func parseAnalyzedSpans(env string) (analyzedSpans map[string]float64, err error) { + analyzedSpans = make(map[string]float64) + if env == "" { + return + } + tokens := strings.Split(env, ",") + for _, token := range tokens { + name, rate, err := parseNameAndRate(token) + if err != nil { + return nil, err + } + analyzedSpans[name] = rate + } + return +} diff --git a/pkg/trace/config/env_test.go b/pkg/trace/config/env_test.go new file mode 100644 index 0000000000000..1235a50da9b37 --- /dev/null +++ b/pkg/trace/config/env_test.go @@ -0,0 +1,260 @@ +package config + +import ( + "os" + "testing" + + "github.com/DataDog/datadog-agent/pkg/config" + log "github.com/cihub/seelog" + "github.com/stretchr/testify/assert" +) + +func TestMain(m *testing.M) { + log.UseLogger(log.Disabled) + os.Exit(m.Run()) +} + +func TestLoadEnv(t *testing.T) { + t.Run("overrides", func(t *testing.T) { + // tests that newer envs. override deprecated ones + for _, tt := range []struct { + envOld, envNew, key string + }{ + {"HTTPS_PROXY", "DD_PROXY_HTTPS", "proxy.https"}, + {"DD_CONNECTION_LIMIT", "DD_APM_CONNECTION_LIMIT", "apm_config.connection_limit"}, + {"DD_RECEIVER_PORT", "DD_APM_RECEIVER_PORT", "apm_config.receiver_port"}, + {"DD_MAX_EPS", "DD_MAX_EPS", "apm_config.max_events_per_second"}, + {"DD_MAX_TPS", "DD_APM_MAX_TPS", "apm_config.max_traces_per_second"}, + {"DD_IGNORE_RESOURCE", "DD_APM_IGNORE_RESOURCES", "apm_config.ignore_resources"}, + } { + assert := assert.New(t) + err := os.Setenv(tt.envOld, "1,2,3") + assert.NoError(err) + defer os.Unsetenv(tt.envOld) + err = os.Setenv(tt.envNew, "4,5,6") + assert.NoError(err) + defer os.Unsetenv(tt.envNew) + _, err = Load("./testdata/full.yaml") + assert.NoError(err) + if tt.envNew == "DD_APM_IGNORE_RESOURCES" { + assert.Equal([]string{"4", "5", "6"}, config.Datadog.GetStringSlice(tt.key)) + } else { + assert.Equal("4,5,6", config.Datadog.Get(tt.key)) + } + } + }) + + for _, ext := range []string{"yaml", "ini"} { + t.Run(ext, func(t *testing.T) { + env := "DD_API_KEY" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "123") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal("123", cfg.Endpoints[0].APIKey) + }) + + env = "DD_SITE" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "my-site.com") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/undocumented." + ext) + assert.NoError(err) + assert.Equal(apiEndpointPrefix+"my-site.com", cfg.Endpoints[0].Host) + }) + + env = "DD_APM_ENABLED" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "true") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.True(cfg.Enabled) + }) + + env = "DD_APM_DD_URL" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "my-site.com") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal("my-site.com", cfg.Endpoints[0].Host) + }) + + env = "HTTPS_PROXY" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "my-proxy.url") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." 
+ ext) + assert.NoError(err) + assert.Equal("my-proxy.url", cfg.ProxyURL.String()) + }) + + env = "DD_PROXY_HTTPS" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "my-proxy.url") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal("my-proxy.url", cfg.ProxyURL.String()) + }) + + env = "DD_HOSTNAME" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "local.host") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal("local.host", cfg.Hostname) + }) + + env = "DD_BIND_HOST" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "bindhost.com") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal("bindhost.com", cfg.StatsdHost) + }) + + for _, envKey := range []string{ + "DD_RECEIVER_PORT", // deprecated + "DD_APM_RECEIVER_PORT", + } { + t.Run(envKey, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(envKey, "1234") + assert.NoError(err) + defer os.Unsetenv(envKey) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal(1234, cfg.ReceiverPort) + }) + } + + env = "DD_DOGSTATSD_PORT" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "4321") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal(4321, cfg.StatsdPort) + }) + + env = "DD_APM_NON_LOCAL_TRAFFIC" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "true") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/undocumented." + ext) + assert.NoError(err) + assert.Equal("0.0.0.0", cfg.ReceiverHost) + }) + + for _, envKey := range []string{ + "DD_IGNORE_RESOURCE", // deprecated + "DD_APM_IGNORE_RESOURCES", + } { + t.Run(envKey, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(envKey, "1,2,3") + assert.NoError(err) + defer os.Unsetenv(envKey) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal([]string{"1", "2", "3"}, cfg.Ignore["resource"]) + }) + } + + env = "DD_LOG_LEVEL" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "warn") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal("warn", cfg.LogLevel) + }) + + env = "DD_APM_ANALYZED_SPANS" + t.Run(env, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(env, "web|http.request=1,db|sql.query=0.5") + assert.NoError(err) + defer os.Unsetenv(env) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal(map[string]map[string]float64{ + "web": map[string]float64{"http.request": 1}, + "db": map[string]float64{"sql.query": 0.5}, + }, cfg.AnalyzedSpansByService) + }) + + for _, envKey := range []string{ + "DD_CONNECTION_LIMIT", // deprecated + "DD_APM_CONNECTION_LIMIT", + } { + t.Run(envKey, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(envKey, "50") + assert.NoError(err) + defer os.Unsetenv(envKey) + cfg, err := Load("./testdata/full." 
+ ext) + assert.NoError(err) + assert.Equal(50, cfg.ConnectionLimit) + }) + } + + for _, envKey := range []string{ + "DD_MAX_TPS", // deprecated + "DD_APM_MAX_TPS", + } { + t.Run(envKey, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(envKey, "6") + assert.NoError(err) + defer os.Unsetenv(envKey) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal(6., cfg.MaxTPS) + }) + } + + for _, envKey := range []string{ + "DD_MAX_EPS", // deprecated + "DD_APM_MAX_EPS", + } { + t.Run(envKey, func(t *testing.T) { + assert := assert.New(t) + err := os.Setenv(envKey, "7") + assert.NoError(err) + defer os.Unsetenv(envKey) + cfg, err := Load("./testdata/full." + ext) + assert.NoError(err) + assert.Equal(7., cfg.MaxEPS) + }) + } + }) + } +} diff --git a/pkg/trace/config/testdata/full.ini b/pkg/trace/config/testdata/full.ini new file mode 100644 index 0000000000000..a4ff15d31d619 --- /dev/null +++ b/pkg/trace/config/testdata/full.ini @@ -0,0 +1,85 @@ +[Main] +dd_url: https://app.datadoghq.com +api_key: api_key_test +hostname: mymachine +proxy_host: https://proxy_for_https +proxy_port: 1234 +proxy_user: user +proxy_password: password +dogstatsd_port: 18125 +non_local_traffic: yes +log_level: debug +apm_enabled: false +bind_host: host.ip +skip_ssl_validation: yes + +[trace.api] +endpoint: https://datadog.unittests + +[trace.config] +env: test +log_file: /path/to/file +log_throttling: no + +[trace.sampler] +extra_sample_rate: 0.5 +max_traces_per_second: 5 +max_events_per_second: 50 + +[trace.ignore] +resource: "/health","/500" + +[trace.analyzed_rate_by_service] +service1: 1.1 +service2: 1.2 + +[trace.analyzed_spans] +service3|op3: 1.3 +service4|op4: 1.4 +service4|op5: 1.5 + +[trace.concentrator] +bucket_size_seconds: 5 +extra_aggregators: a,b,c + +[trace.receiver] +receiver_port: 18126 +connection_limit: 2000 +timeout: 4 + +[trace.watchdog] +max_memory: 1234.5 +max_cpu_percent: 85 +max_connections: 40 +check_delay_seconds: 5 + +[trace.writer.services] +flush_period_seconds: 1 +update_info_period_seconds: 1 +queue_max_age_seconds: 1 +queue_max_bytes: 456 +queue_max_payloads: 4 +exp_backoff_max_duration_seconds: 4 +exp_backoff_base_milliseconds: 1 +exp_backoff_growth_base: 2 + +[trace.writer.stats] +max_entries_per_payload: 10 +update_info_period_seconds: 2 +queue_max_age_seconds: 1 +queue_max_bytes: 456 +queue_max_payloads: 4 +exp_backoff_max_duration_seconds: 4 +exp_backoff_base_milliseconds: 1 +exp_backoff_growth_base: 2 + +[trace.writer.traces] +max_spans_per_payload: 100 +flush_period_seconds: 3 +update_info_period_seconds: 2 +queue_max_age_seconds: 1 +queue_max_bytes: 456 # 64-bit integer +queue_max_payloads: 4 +exp_backoff_max_duration_seconds: 4 +exp_backoff_base_milliseconds: 1 +exp_backoff_growth_base: 2 diff --git a/pkg/trace/config/testdata/full.yaml b/pkg/trace/config/testdata/full.yaml new file mode 100644 index 0000000000000..65ecc1b6ff7e3 --- /dev/null +++ b/pkg/trace/config/testdata/full.yaml @@ -0,0 +1,63 @@ +dd_url: https://app.datadoghq.com +api_key: api_key_test +hostname: mymachine +proxy: + https: https://user:password@proxy_for_https:1234 + no_proxy: + - https://my2.endpoint.eu +use_dogstatsd: yes +skip_ssl_validation: yes +dogstatsd_port: 18125 +dogstatsd_non_local_traffic: yes +log_level: debug +apm_config: + enabled: false + log_file: abc + apm_dd_url: https://datadog.unittests + max_cpu_percent: 0.5 + max_memory: 123.4 + max_connections: 12 + additional_endpoints: + https://my1.endpoint.com: + - apikey1 + - apikey2 + 
https://my2.endpoint.eu: + - apikey3 + env: test + receiver_port: 18126 + connection_limit: 123 + apm_non_local_traffic: yes + extra_sample_rate: 0.5 + max_traces_per_second: 5 + max_events_per_second: 50 + ignore_resources: + - /health + - /500 + + replace_tags: + - name: "http.method" + pattern: "\\?.*$" + repl: "GET" + - name: "http.url" + pattern: "\\?.*$" + repl: "!" + + obfuscation: + elasticsearch: + enabled: true + keep_values: + - user_id + - category_id + mongodb: + enabled: true + keep_values: + - uid + - cat_id + http: + remove_query_string: true + remove_paths_with_digits: true + remove_stack_traces: true + redis: + enabled: true + memcached: + enabled: true diff --git a/pkg/trace/config/testdata/multi_api_keys.ini b/pkg/trace/config/testdata/multi_api_keys.ini new file mode 100644 index 0000000000000..f641c8fabd794 --- /dev/null +++ b/pkg/trace/config/testdata/multi_api_keys.ini @@ -0,0 +1,3 @@ +[Main] +dd_url=url1, url2 +api_key=foo, bar diff --git a/pkg/trace/config/testdata/no_apm_config.ini b/pkg/trace/config/testdata/no_apm_config.ini new file mode 100644 index 0000000000000..e90cf8c5705ee --- /dev/null +++ b/pkg/trace/config/testdata/no_apm_config.ini @@ -0,0 +1,6 @@ +[Main] +hostname = thing +api_key = apikey_12 +bind_host = 0.0.0.0 +dogstatsd_port = 28125 +log_level = DEBUG diff --git a/pkg/trace/config/testdata/site_default.yaml b/pkg/trace/config/testdata/site_default.yaml new file mode 100644 index 0000000000000..b32e10cd8ce14 --- /dev/null +++ b/pkg/trace/config/testdata/site_default.yaml @@ -0,0 +1,2 @@ +api_key: api_key_test +dd_url: https://app.datadoghq.com diff --git a/pkg/trace/config/testdata/site_eu.yaml b/pkg/trace/config/testdata/site_eu.yaml new file mode 100644 index 0000000000000..622a0ce1d6879 --- /dev/null +++ b/pkg/trace/config/testdata/site_eu.yaml @@ -0,0 +1,3 @@ +api_key: api_key_test +dd_url: https://app.datadoghq.com +site: datadoghq.eu diff --git a/pkg/trace/config/testdata/site_override.yaml b/pkg/trace/config/testdata/site_override.yaml new file mode 100644 index 0000000000000..c6366b9e778e8 --- /dev/null +++ b/pkg/trace/config/testdata/site_override.yaml @@ -0,0 +1,5 @@ +api_key: api_key_test +dd_url: https://app.datadoghq.com +site: datadoghq.br +apm_config: + apm_dd_url: some.other.datadoghq.eu diff --git a/pkg/trace/config/testdata/site_url.yaml b/pkg/trace/config/testdata/site_url.yaml new file mode 100644 index 0000000000000..c4b1ff479f8c4 --- /dev/null +++ b/pkg/trace/config/testdata/site_url.yaml @@ -0,0 +1,4 @@ +api_key: api_key_test +dd_url: https://app.datadoghq.com +apm_config: + apm_dd_url: some.other.datadoghq.eu diff --git a/pkg/trace/config/testdata/undocumented.ini b/pkg/trace/config/testdata/undocumented.ini new file mode 100644 index 0000000000000..2896621a46225 --- /dev/null +++ b/pkg/trace/config/testdata/undocumented.ini @@ -0,0 +1,11 @@ +[Main] +api_key: any + +[trace.analyzed_rate_by_service] +web: 0.8 +intake: 0.05 + +[trace.analyzed_spans] +web|http.request: 0.8 +web|django.request: 0.9 +db|intake: 0.05 diff --git a/pkg/trace/config/testdata/undocumented.yaml b/pkg/trace/config/testdata/undocumented.yaml new file mode 100644 index 0000000000000..12cdc763f3dcf --- /dev/null +++ b/pkg/trace/config/testdata/undocumented.yaml @@ -0,0 +1,41 @@ +api_key: apikey_12 +hostname: thing +apm_config: + extra_sample_rate: 0.33 + dd_agent_bin: /path/to/bin + max_traces_per_second: 100.0 + max_events_per_second: 1000.0 + receiver_port: 25 + max_cpu_percent: 7 + max_connections: 50 + max_memory: 30000000 + trace_writer: + 
max_spans_per_payload: 11 + flush_period_seconds: 22 + update_info_period_seconds: 33 + queue: + max_age_seconds: 15 + max_bytes: 2048 + max_payloads: 100 + service_writer: + update_info_period_seconds: 44 + flush_period_seconds: 55 + queue: + max_age_seconds: 15 + max_bytes: 2048 + max_payloads: 100 + stats_writer: + update_info_period_seconds: 66 + queue: + max_age_seconds: 15 + max_bytes: 2048 + max_payloads: 100 + analyzed_rate_by_service: + db: 1 + web: 0.9 + index: 0.5 + analyzed_spans: + web|request: 0.8 + web|django.request: 0.9 + db|intake: 0.05 + bad_format: 0.5 diff --git a/pkg/trace/event/doc.go b/pkg/trace/event/doc.go new file mode 100644 index 0000000000000..8035b7dfb8e57 --- /dev/null +++ b/pkg/trace/event/doc.go @@ -0,0 +1,16 @@ +// Package event contains functionality related to APM event extraction from traces. +// +// APM Events constitute the core of Datadog's Trace Search functionality. These are, in a nutshell, individual spans +// containing important information (but not full trace tree) about an execution and which can therefore be sampled at a +// different rate (retaining greater cardinality than that of complete traces). Furthermore, all information in APM events +// can be indexed, allowing for very flexible searching. +// +// For instance, consider a web server. The top-level span on traces from this web server likely contains interesting +// things such as customer/user id, IPs, HTTP tags, HTTP endpoint, among others. By extracting this top level span from +// each trace, converting it into an APM event and feeding it into trace search, you can potentially search and aggregate +// this information for all requests arriving at your web server. You couldn't do the same thing with traces because these +// capture entire execution trees which are much more expensive to process and store and are therefore heavily sampled. +// +// Of course, if the trace from which APM events were extracted also survives sampling, you can easily see the execution +// tree associated with a particular APM event as this link is kept throughout the entire processing pipeline. +package event diff --git a/pkg/trace/event/extractor.go b/pkg/trace/event/extractor.go new file mode 100644 index 0000000000000..4d560924dd2b8 --- /dev/null +++ b/pkg/trace/event/extractor.go @@ -0,0 +1,14 @@ +package event + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// Extractor extracts APM events from matching spans. +type Extractor interface { + // Extract decides whether to extract an APM event from the provided span with the specified priority and returns + // a suggested extraction sample rate and a bool value. If no event was extracted the bool value will be false and + // the rate should not be used. + Extract(span *agent.WeightedSpan, priority sampler.SamplingPriority) (rate float64, ok bool) +} diff --git a/pkg/trace/event/extractor_fixed_rate.go b/pkg/trace/event/extractor_fixed_rate.go new file mode 100644 index 0000000000000..eb7b00b5a58e2 --- /dev/null +++ b/pkg/trace/event/extractor_fixed_rate.go @@ -0,0 +1,40 @@ +package event + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// fixedRateExtractor is an event extractor that decides whether to extract APM events from spans based on +// `(service name, operation name) => sampling rate` mappings. 
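+//
+// Illustrative sketch, not part of the original patch (span stands for any
+// *agent.WeightedSpan with the matching service and operation name):
+//
+//	ex := NewFixedRateExtractor(map[string]map[string]float64{
+//		"web": {"http.request": 0.5},
+//	})
+//	rate, ok := ex.Extract(span, sampler.PriorityAutoKeep) // rate == 0.5, ok == true
+//
+// Spans from any other (service, operation) pair yield ok == false, and a matching span
+// with priority UserKeep has its rate upscaled to 1.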
+type fixedRateExtractor struct { + rateByServiceAndName map[string]map[string]float64 +} + +// NewFixedRateExtractor returns an APM event extractor that decides whether to extract APM events from spans following +// the provided extraction rates for a span's (service name, operation name) pair. +func NewFixedRateExtractor(rateByServiceAndName map[string]map[string]float64) Extractor { + return &fixedRateExtractor{ + rateByServiceAndName: rateByServiceAndName, + } +} + +// Extract decides to extract an apm event from a span if its service and name have a corresponding extraction rate +// on the rateByServiceAndName map passed in the constructor. The extracted event is returned along with the associated +// extraction rate and a true value. If no extraction happened, false is returned as the third value and the others +// are invalid. +func (e *fixedRateExtractor) Extract(s *agent.WeightedSpan, priority sampler.SamplingPriority) (float64, bool) { + operations, ok := e.rateByServiceAndName[s.Service] + if !ok { + return 0, false + } + extractionRate, ok := operations[s.Name] + if !ok { + return 0, false + } + if extractionRate > 0 && priority >= sampler.PriorityUserKeep { + // If the span has been manually sampled, we always want to keep these events + extractionRate = 1 + } + return extractionRate, true +} diff --git a/pkg/trace/event/extractor_fixed_rate_test.go b/pkg/trace/event/extractor_fixed_rate_test.go new file mode 100644 index 0000000000000..26cd20f96782b --- /dev/null +++ b/pkg/trace/event/extractor_fixed_rate_test.go @@ -0,0 +1,52 @@ +package event + +import ( + "math/rand" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +func createTestSpans(serviceName string, operationName string) []*agent.WeightedSpan { + spans := make([]*agent.WeightedSpan, 1000) + for i, _ := range spans { + spans[i] = &agent.WeightedSpan{Span: &pb.Span{TraceID: rand.Uint64(), Service: serviceName, Name: operationName}} + } + return spans +} + +func TestAnalyzedExtractor(t *testing.T) { + config := make(map[string]map[string]float64) + config["serviceA"] = make(map[string]float64) + config["serviceA"]["opA"] = 0 + + config["serviceB"] = make(map[string]float64) + config["serviceB"]["opB"] = 0.5 + + config["serviceC"] = make(map[string]float64) + config["serviceC"]["opC"] = 1 + + tests := []extractorTestCase{ + // Name: /(/) + {"none/noservice", createTestSpans("serviceZ", "opA"), 0, -1}, + {"none/noname", createTestSpans("serviceA", "opZ"), 0, -1}, + {"none/0", createTestSpans("serviceA", "opA"), 0, 0}, + {"none/0.5", createTestSpans("serviceB", "opB"), 0, 0.5}, + {"none/1", createTestSpans("serviceC", "opC"), 0, 1}, + {"1/noservice", createTestSpans("serviceZ", "opA"), 1, -1}, + {"1/noname", createTestSpans("serviceA", "opZ"), 1, -1}, + {"1/0", createTestSpans("serviceA", "opA"), 1, 0}, + {"1/0.5", createTestSpans("serviceB", "opB"), 1, 0.5}, + {"1/1", createTestSpans("serviceC", "opC"), 1, 1}, + {"2/noservice", createTestSpans("serviceZ", "opA"), 2, -1}, + {"2/noname", createTestSpans("serviceA", "opZ"), 2, -1}, + {"2/0", createTestSpans("serviceA", "opA"), 2, 0}, + {"2/0.5", createTestSpans("serviceB", "opB"), 2, 1}, + {"2/1", createTestSpans("serviceC", "opC"), 2, 1}, + } + + for _, test := range tests { + testExtractor(t, NewFixedRateExtractor(config), test) + } +} diff --git a/pkg/trace/event/extractor_legacy.go b/pkg/trace/event/extractor_legacy.go new file mode 100644 index 0000000000000..48be03310c253 --- /dev/null +++ 
b/pkg/trace/event/extractor_legacy.go @@ -0,0 +1,35 @@ +package event + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// legacyExtractor is an event extractor that decides whether to extract APM events from spans based on +// `serviceName => sampling rate` mappings. +type legacyExtractor struct { + rateByService map[string]float64 +} + +// NewLegacyExtractor returns an APM event extractor that decides whether to extract APM events from spans following the +// specified extraction rates for a span's service. +func NewLegacyExtractor(rateByService map[string]float64) Extractor { + return &legacyExtractor{ + rateByService: rateByService, + } +} + +// Extract decides to extract an apm event from the provided span if there's an extraction rate configured for that +// span's service. In this case the extracted event is returned along with the found extraction rate and a true value. +// If this rate doesn't exist or the provided span is not a top level one, then no extraction is done and false is +// returned as the third value, with the others being invalid. +func (e *legacyExtractor) Extract(s *agent.WeightedSpan, priority sampler.SamplingPriority) (float64, bool) { + if !s.TopLevel { + return 0, false + } + extractionRate, ok := e.rateByService[s.Service] + if !ok { + return 0, false + } + return extractionRate, true +} diff --git a/pkg/trace/event/extractor_metric.go b/pkg/trace/event/extractor_metric.go new file mode 100644 index 0000000000000..e40357cad82d0 --- /dev/null +++ b/pkg/trace/event/extractor_metric.go @@ -0,0 +1,38 @@ +package event + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// metricBasedExtractor is an event extractor that decides whether to extract APM events from spans based on +// the value of the event extraction rate metric set on those spans. +type metricBasedExtractor struct{} + +// NewMetricBasedExtractor returns an APM event extractor that decides whether to extract APM events from spans based on +// the value of the event extraction rate metric set on those span. +func NewMetricBasedExtractor() Extractor { + return &metricBasedExtractor{} +} + +// Extract decides whether to extract APM events from a span based on the value of the event extraction rate metric set +// on that span. If such a value exists, the extracted event is returned along with this rate and a true value. +// Otherwise, false is returned as the third value and the others are invalid. +// +// NOTE: If priority is UserKeep (manually sampled) any extraction rate bigger than 0 is upscaled to 1 to ensure no +// extraction sampling is done on this event. 
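+//
+// Illustrative sketch, not part of the original patch: a span carrying
+//
+//	Metrics: map[string]float64{sampler.KeySamplingRateEventExtraction: 0.25}
+//
+// yields (0.25, true) at priorities below UserKeep and (1, true) at UserKeep, while a
+// span without that metric yields (0, false).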
+func (e *metricBasedExtractor) Extract(s *agent.WeightedSpan, priority sampler.SamplingPriority) (float64, bool) { + if len(s.Metrics) == 0 { + // metric not set + return 0, false + } + extractionRate, ok := s.Metrics[sampler.KeySamplingRateEventExtraction] + if !ok { + return 0, false + } + if extractionRate > 0 && priority >= sampler.PriorityUserKeep { + // If the trace has been manually sampled, we keep all matching spans + extractionRate = 1 + } + return extractionRate, true +} diff --git a/pkg/trace/event/extractor_metric_test.go b/pkg/trace/event/extractor_metric_test.go new file mode 100644 index 0000000000000..8629c8cf73cbe --- /dev/null +++ b/pkg/trace/event/extractor_metric_test.go @@ -0,0 +1,44 @@ +package event + +import ( + "math/rand" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +func createTestSpansWithEventRate(eventRate float64) []*agent.WeightedSpan { + spans := make([]*agent.WeightedSpan, 1000) + for i, _ := range spans { + spans[i] = &agent.WeightedSpan{Span: &pb.Span{TraceID: rand.Uint64(), Service: "test", Name: "test", Metrics: map[string]float64{}}} + if eventRate >= 0 { + spans[i].Metrics[sampler.KeySamplingRateEventExtraction] = eventRate + } + } + return spans +} + +func TestMetricBasedExtractor(t *testing.T) { + tests := []extractorTestCase{ + // Name: / + {"none/missing", createTestSpansWithEventRate(-1), 0, -1}, + {"none/0", createTestSpansWithEventRate(0), 0, 0}, + {"none/0.5", createTestSpansWithEventRate(0.5), 0, 0.5}, + {"none/1", createTestSpansWithEventRate(1), 0, 1}, + {"1/missing", createTestSpansWithEventRate(-1), 1, -1}, + {"1/0", createTestSpansWithEventRate(0), 1, 0}, + {"1/0.5", createTestSpansWithEventRate(0.5), 1, 0.5}, + {"1/1", createTestSpansWithEventRate(1), 1, 1}, + // Priority 2 should have extraction rate of 1 so long as any extraction rate is set and > 0 + {"2/missing", createTestSpansWithEventRate(-1), 2, -1}, + {"2/0", createTestSpansWithEventRate(0), 2, 0}, + {"2/0.5", createTestSpansWithEventRate(0.5), 2, 1}, + {"2/1", createTestSpansWithEventRate(1), 2, 1}, + } + + for _, test := range tests { + testExtractor(t, NewMetricBasedExtractor(), test) + } +} diff --git a/pkg/trace/event/extractor_noop.go b/pkg/trace/event/extractor_noop.go new file mode 100644 index 0000000000000..1f3d1d492c868 --- /dev/null +++ b/pkg/trace/event/extractor_noop.go @@ -0,0 +1,18 @@ +package event + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// noopExtractor is a no-op APM event extractor used when APM event extraction is disabled. +type noopExtractor struct{} + +// NewNoopExtractor returns a new APM event extractor that does not extract any events. 
+func NewNoopExtractor() Extractor { + return &noopExtractor{} +} + +func (e *noopExtractor) Extract(_ *agent.WeightedSpan, _ sampler.SamplingPriority) (float64, bool) { + return 0, false +} diff --git a/pkg/trace/event/extractor_test.go b/pkg/trace/event/extractor_test.go new file mode 100644 index 0000000000000..afca60cb7cfee --- /dev/null +++ b/pkg/trace/event/extractor_test.go @@ -0,0 +1,36 @@ +package event + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/stretchr/testify/assert" +) + +type extractorTestCase struct { + name string + spans []*agent.WeightedSpan + priority sampler.SamplingPriority + expectedExtractionRate float64 +} + +func testExtractor(t *testing.T, extractor Extractor, testCase extractorTestCase) { + t.Run(testCase.name, func(t *testing.T) { + assert := assert.New(t) + + total := 0 + + for _, span := range testCase.spans { + rate, ok := extractor.Extract(span, testCase.priority) + + total++ + + if !ok { + rate = -1 + } + + assert.EqualValues(testCase.expectedExtractionRate, rate) + } + }) +} diff --git a/pkg/trace/event/processor.go b/pkg/trace/event/processor.go new file mode 100644 index 0000000000000..ad34fecb0a9cf --- /dev/null +++ b/pkg/trace/event/processor.go @@ -0,0 +1,120 @@ +package event + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// Processor is responsible for all the logic surrounding extraction and sampling of APM events from processed traces. +type Processor struct { + extractors []Extractor + maxEPSSampler eventSampler +} + +// NewProcessor returns a new instance of Processor configured with the provided extractors and max eps limitation. +// +// Extractors will look at each span in the trace and decide whether it should be converted to an APM event or not. They +// will be tried in the provided order, with the first one returning an event stopping the chain. +// +// All extracted APM events are then submitted to sampling. This sampling is 2-fold: +// * A first sampling step is done based on the extraction sampling rate returned by an Extractor. If an Extractor +// returns an event accompanied with a 0.1 extraction rate, then there's a 90% chance that this event will get +// discarded. +// * A max events per second maxEPSSampler is applied to all non-PriorityUserKeep events that survived the first step +// and will ensure that, in average, the total rate of events returned by the processor is not bigger than maxEPS. +func NewProcessor(extractors []Extractor, maxEPS float64) *Processor { + return newProcessor(extractors, newMaxEPSSampler(maxEPS)) +} + +func newProcessor(extractors []Extractor, maxEPSSampler eventSampler) *Processor { + return &Processor{ + extractors: extractors, + maxEPSSampler: maxEPSSampler, + } +} + +// Start starts the processor. +func (p *Processor) Start() { + p.maxEPSSampler.Start() +} + +// Stop stops the processor. +func (p *Processor) Stop() { + p.maxEPSSampler.Stop() +} + +// Process takes a processed trace, extracts events from it and samples them, returning a collection of +// sampled events along with the total count of extracted events. 
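+//
+// A minimal wiring sketch (the extractor order, the maxEPS value of 200, and the ratesByService / processedTrace
+// variables are illustrative assumptions, not defaults from this package):
+//
+//	p := NewProcessor([]Extractor{NewMetricBasedExtractor(), NewLegacyExtractor(ratesByService)}, 200)
+//	p.Start()
+//	events, numExtracted := p.Process(processedTrace)
+//	p.Stop()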
+func (p *Processor) Process(t agent.ProcessedTrace) (events []*pb.Span, numExtracted int64) { + if len(p.extractors) == 0 { + return + } + + priority, hasPriority := t.GetSamplingPriority() + if !hasPriority { + priority = sampler.PriorityNone + } + + clientSampleRate := sampler.GetClientRate(t.Root) + preSampleRate := sampler.GetPreSampleRate(t.Root) + + for _, wspan := range t.WeightedTrace { + extractionRate, ok := p.extract(wspan, priority) + if !ok { + continue + } + + event := wspan.Span + sampled := p.extractionSample(event, extractionRate) + if !sampled { + continue + } + numExtracted++ + + sampled, epsRate := p.maxEPSSample(event, priority) + if !sampled { + continue + } + + // This event got sampled, so add it to results + events = append(events, event) + // And set whatever rates had been set on the trace initially + sampler.SetClientRate(event, clientSampleRate) + sampler.SetPreSampleRate(event, preSampleRate) + // As well as the rates of sampling done during this processing + sampler.SetEventExtractionRate(event, extractionRate) + sampler.SetMaxEPSRate(event, epsRate) + if hasPriority { + sampler.SetSamplingPriority(event, priority) + } + } + + return +} + +func (p *Processor) extract(span *agent.WeightedSpan, priority sampler.SamplingPriority) (float64, bool) { + for _, extractor := range p.extractors { + if rate, ok := extractor.Extract(span, priority); ok { + return rate, ok + } + } + return 0, false +} + +func (p *Processor) extractionSample(event *pb.Span, extractionRate float64) bool { + return sampler.SampleByRate(event.TraceID, extractionRate) +} + +func (p *Processor) maxEPSSample(event *pb.Span, priority sampler.SamplingPriority) (sampled bool, rate float64) { + if priority == sampler.PriorityUserKeep { + return true, 1 + } + return p.maxEPSSampler.Sample(event) +} + +type eventSampler interface { + Start() + Sample(event *pb.Span) (sampled bool, rate float64) + Stop() +} diff --git a/pkg/trace/event/processor_test.go b/pkg/trace/event/processor_test.go new file mode 100644 index 0000000000000..c6223a6f2ed8a --- /dev/null +++ b/pkg/trace/event/processor_test.go @@ -0,0 +1,136 @@ +package event + +import ( + "math/rand" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/stretchr/testify/assert" +) + +func TestProcessor(t *testing.T) { + tests := []struct { + name string + extractorRates []float64 + samplerRate float64 + priority sampler.SamplingPriority + expectedExtractedPct float64 + expectedSampledPct float64 + deltaPct float64 + }{ + // Name: // + {"none/1/none", nil, 1, sampler.PriorityNone, 0, 0, 0}, + + // Test Extractors + {"0/1/none", []float64{0}, 1, sampler.PriorityNone, 0, 0, 0}, + {"0.5/1/none", []float64{0.5}, 1, sampler.PriorityNone, 0.5, 1, 0.1}, + {"-1,0.8/1/none", []float64{-1, 0.8}, 1, sampler.PriorityNone, 0.8, 1, 0.1}, + {"-1,-1,-0.8/1/none", []float64{-1, -1, 0.8}, 1, sampler.PriorityNone, 0.8, 1, 0.1}, + + // Test MaxEPS sampler + {"1/0/none", []float64{1}, 0, sampler.PriorityNone, 1, 0, 0}, + {"1/0.5/none", []float64{1}, 0.5, sampler.PriorityNone, 1, 0.5, 0.1}, + {"1/1/none", []float64{1}, 1, sampler.PriorityNone, 1, 1, 0}, + + // Test Extractor and Sampler combinations + {"-1,0.8/0.8/none", []float64{-1, 0.8}, 0.8, sampler.PriorityNone, 0.8, 0.8, 0.1}, + {"-1,0.8/0.8/autokeep", []float64{-1, 0.8}, 0.8, sampler.PriorityAutoKeep, 0.8, 0.8, 0.1}, + // Test userkeep bypass of max eps + {"-1,0.8/0.8/userkeep", 
[]float64{-1, 0.8}, 0.8, sampler.PriorityUserKeep, 0.8, 1, 0.1}, + } + + testClientSampleRate := 0.3 + testPreSampleRate := 0.5 + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + assert := assert.New(t) + + extractors := make([]Extractor, len(test.extractorRates)) + for i, rate := range test.extractorRates { + extractors[i] = &MockExtractor{Rate: rate} + } + + testSampler := &MockEventSampler{Rate: test.samplerRate} + p := newProcessor(extractors, testSampler) + + testSpans := createTestSpans("test", "test") + testTrace := agent.ProcessedTrace{WeightedTrace: testSpans} + testTrace.Root = testSpans[0].Span + sampler.SetPreSampleRate(testTrace.Root, testPreSampleRate) + sampler.SetClientRate(testTrace.Root, testClientSampleRate) + if test.priority != sampler.PriorityNone { + sampler.SetSamplingPriority(testTrace.Root, test.priority) + } + + p.Start() + events, extracted := p.Process(testTrace) + p.Stop() + total := len(testSpans) + returned := len(events) + + expectedExtracted := float64(total) * test.expectedExtractedPct + assert.InDelta(expectedExtracted, extracted, expectedExtracted*test.deltaPct) + + expectedReturned := expectedExtracted * test.expectedSampledPct + assert.InDelta(expectedReturned, returned, expectedReturned*test.deltaPct) + + assert.EqualValues(1, testSampler.StartCalls) + assert.EqualValues(1, testSampler.StopCalls) + + expectedSampleCalls := extracted + if test.priority == sampler.PriorityUserKeep { + expectedSampleCalls = 0 + } + assert.EqualValues(expectedSampleCalls, testSampler.SampleCalls) + + for _, event := range events { + assert.EqualValues(test.expectedExtractedPct, sampler.GetEventExtractionRate(event)) + assert.EqualValues(test.expectedSampledPct, sampler.GetMaxEPSRate(event)) + assert.EqualValues(testClientSampleRate, sampler.GetClientRate(event)) + assert.EqualValues(testPreSampleRate, sampler.GetPreSampleRate(event)) + + priority, ok := sampler.GetSamplingPriority(event) + if !ok { + priority = sampler.PriorityNone + } + assert.EqualValues(test.priority, priority) + } + }) + } +} + +type MockExtractor struct { + Rate float64 +} + +func (e *MockExtractor) Extract(s *agent.WeightedSpan, priority sampler.SamplingPriority) (float64, bool) { + if e.Rate < 0 { + return 0, false + } + return e.Rate, true +} + +type MockEventSampler struct { + Rate float64 + + StartCalls int + StopCalls int + SampleCalls int +} + +func (s *MockEventSampler) Start() { + s.StartCalls++ +} + +func (s *MockEventSampler) Stop() { + s.StopCalls++ +} + +func (s *MockEventSampler) Sample(event *pb.Span) (bool, float64) { + s.SampleCalls++ + + return rand.Float64() < s.Rate, s.Rate +} diff --git a/pkg/trace/event/sampler_max_eps.go b/pkg/trace/event/sampler_max_eps.go new file mode 100644 index 0000000000000..a46c88e041938 --- /dev/null +++ b/pkg/trace/event/sampler_max_eps.go @@ -0,0 +1,149 @@ +package event + +import ( + "time" + + log "github.com/cihub/seelog" + + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +const maxEPSReportFrequency = 10 * time.Second + +// maxEPSSampler (Max Events Per Second Sampler) is an event maxEPSSampler that samples provided events so as to try to ensure +// no more than a certain amount of events is sampled per second. +// +// Note that events associated with traces with UserPriorityKeep are always sampled and don't influence underlying +// rate counters so as not to skew stats. 
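+//
+// For example, with maxEPS = 200 and a measured rate of 500 events per second, each event is kept with
+// rate 200/500 = 0.4, so the sampled output converges to roughly 200 events per second on average.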
+type maxEPSSampler struct { + maxEPS float64 + rateCounter rateCounter + + reportFrequency time.Duration + reportDone chan bool +} + +// NewMaxEPSSampler creates a new instance of a maxEPSSampler with the provided maximum amount of events per second. +func newMaxEPSSampler(maxEPS float64) *maxEPSSampler { + return &maxEPSSampler{ + maxEPS: maxEPS, + rateCounter: newSamplerBackendRateCounter(), + + reportDone: make(chan bool), + } +} + +// Start starts the underlying rate counter. +func (s *maxEPSSampler) Start() { + s.rateCounter.Start() + + go func() { + ticker := time.NewTicker(maxEPSReportFrequency) + defer close(s.reportDone) + defer ticker.Stop() + + for { + select { + case <-s.reportDone: + return + case <-ticker.C: + s.report() + } + } + }() +} + +// Stop stops the underlying rate counter. +func (s *maxEPSSampler) Stop() { + s.reportDone <- true + <-s.reportDone + + s.rateCounter.Stop() +} + +// Sample determines whether or not we should sample the provided event in order to ensure no more than maxEPS events +// are sampled every second. +func (s *maxEPSSampler) Sample(event *pb.Span) (sampled bool, rate float64) { + // Count that we saw a new event + s.rateCounter.Count() + rate = 1.0 + currentEPS := s.rateCounter.GetRate() + if currentEPS > s.maxEPS { + rate = s.maxEPS / currentEPS + } + sampled = sampler.SampleByRate(event.TraceID, rate) + return +} + +// getSampleRate returns the applied sample rate based on this sampler's current state. +func (s *maxEPSSampler) getSampleRate() float64 { + rate := 1.0 + currentEPS := s.rateCounter.GetRate() + if currentEPS > s.maxEPS { + rate = s.maxEPS / currentEPS + } + return rate +} + +func (s *maxEPSSampler) report() { + maxRate := s.maxEPS + metrics.Gauge("datadog.trace_agent.events.max_eps.max_rate", maxRate, nil, 1) + + currentRate := s.rateCounter.GetRate() + metrics.Gauge("datadog.trace_agent.events.max_eps.current_rate", currentRate, nil, 1) + + sampleRate := s.getSampleRate() + metrics.Gauge("datadog.trace_agent.events.max_eps.sample_rate", sampleRate, nil, 1) + + reachedMaxGaugeV := 0. + if sampleRate < 1 { + reachedMaxGaugeV = 1. + log.Warnf("Max events per second reached (current=%.2f/s, max=%.2f/s). "+ + "Some events are now being dropped (sample rate=%.2f). Consider adjusting event sampling rates.", + currentRate, maxRate, sampleRate) + } + metrics.Gauge("datadog.trace_agent.events.max_eps.reached_max", reachedMaxGaugeV, nil, 1) +} + +// rateCounter keeps track of different event rates. +type rateCounter interface { + Start() + Count() + GetRate() float64 + Stop() +} + +// samplerBackendRateCounter is a rateCounter backed by a maxEPSSampler.Backend. +type samplerBackendRateCounter struct { + backend sampler.Backend +} + +// newSamplerBackendRateCounter creates a new samplerBackendRateCounter based on exponential decay counters. +func newSamplerBackendRateCounter() *samplerBackendRateCounter { + return &samplerBackendRateCounter{ + // TODO: Allow these to be configurable or study better defaults based on intended target + backend: sampler.NewMemoryBackend(1*time.Second, 1.125), + } +} + +// Start starts the decaying of the backend rate counter. +func (sb *samplerBackendRateCounter) Start() { + go sb.backend.Run() +} + +// Stop stops the decaying of the backend rate counter. +func (sb *samplerBackendRateCounter) Stop() { + sb.backend.Stop() +} + +// Count adds an event to the rate computation. +func (sb *samplerBackendRateCounter) Count() { + sb.backend.CountSample() +} + +// GetRate gets the current event rate. 
+func (sb *samplerBackendRateCounter) GetRate() float64 { + return sb.backend.GetUpperSampledScore() +} diff --git a/pkg/trace/event/sampler_max_eps_test.go b/pkg/trace/event/sampler_max_eps_test.go new file mode 100644 index 0000000000000..76ab418d1758b --- /dev/null +++ b/pkg/trace/event/sampler_max_eps_test.go @@ -0,0 +1,74 @@ +package event + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + "github.com/stretchr/testify/assert" +) + +func TestMaxEPSSampler(t *testing.T) { + for _, testCase := range []struct { + name string + events []*pb.Span + maxEPS float64 + pastEPS float64 + expectedSampleRate float64 + deltaPct float64 + }{ + {"low", generateTestEvents(1000), 100, 50, 1., 0}, + {"limit", generateTestEvents(1000), 100, 100, 1., 0}, + {"overload", generateTestEvents(1000), 100, 150, 100. / 150., 0.05}, + } { + t.Run(testCase.name, func(t *testing.T) { + assert := assert.New(t) + + counter := &MockRateCounter{ + GetRateResult: testCase.pastEPS, + } + testSampler := newMaxEPSSampler(testCase.maxEPS) + testSampler.rateCounter = counter + testSampler.Start() + + sampled := 0 + for _, event := range testCase.events { + sample, rate := testSampler.Sample(event) + if sample { + sampled++ + } + assert.EqualValues(testCase.expectedSampleRate, rate) + } + + testSampler.Stop() + + assert.InDelta(testCase.expectedSampleRate, float64(sampled)/float64(len(testCase.events)), testCase.expectedSampleRate*testCase.deltaPct) + }) + } +} + +func generateTestEvents(numEvents int) []*pb.Span { + testEvents := make([]*pb.Span, numEvents) + for i, _ := range testEvents { + testEvents[i] = testutil.RandomSpan() + } + return testEvents +} + +type MockRateCounter struct { + CountCalls int + GetRateCalls int + GetRateResult float64 +} + +func (mc *MockRateCounter) Start() {} +func (mc *MockRateCounter) Stop() {} + +func (mc *MockRateCounter) Count() { + mc.CountCalls++ +} + +func (mc *MockRateCounter) GetRate() float64 { + mc.GetRateCalls++ + return mc.GetRateResult +} diff --git a/pkg/trace/filters/blacklister.go b/pkg/trace/filters/blacklister.go new file mode 100644 index 0000000000000..bce4b7ac80857 --- /dev/null +++ b/pkg/trace/filters/blacklister.go @@ -0,0 +1,44 @@ +package filters + +import ( + "regexp" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" +) + +// Blacklister holds a list of regular expressions which will match resources +// on spans that should be dropped. +type Blacklister struct { + list []*regexp.Regexp +} + +// Allows returns true if the Blacklister permits this span. +func (f *Blacklister) Allows(span *pb.Span) bool { + for _, entry := range f.list { + if entry.MatchString(span.Resource) { + return false + } + } + return true +} + +// NewBlacklister creates a new Blacklister based on the given list of +// regular expressions. +func NewBlacklister(exprs []string) *Blacklister { + return &Blacklister{list: compileRules(exprs)} +} + +// compileRules compiles as many rules as possible from the list of expressions. 
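+//
+// For example (the expressions below are illustrative), an entry that fails to compile is logged and skipped
+// instead of invalidating the rest of the blacklist:
+//
+//	f := NewBlacklister([]string{"(GET|POST) /healthcheck", "[123"}) // "[123" does not compile and is dropped
+//	f.Allows(&pb.Span{Resource: "GET /healthcheck"})                 // false: resource matches, span is filtered
+//	f.Allows(&pb.Span{Resource: "GET /users"})                       // true: span is kept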
+func compileRules(exprs []string) []*regexp.Regexp { + list := make([]*regexp.Regexp, 0, len(exprs)) + for _, entry := range exprs { + rule, err := regexp.Compile(entry) + if err != nil { + log.Errorf("invalid resource filter: %q", entry) + continue + } + list = append(list, rule) + } + return list +} diff --git a/pkg/trace/filters/blacklister_test.go b/pkg/trace/filters/blacklister_test.go new file mode 100644 index 0000000000000..2984abe20b3b2 --- /dev/null +++ b/pkg/trace/filters/blacklister_test.go @@ -0,0 +1,48 @@ +package filters + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + + "github.com/stretchr/testify/assert" +) + +func TestBlacklister(t *testing.T) { + tests := []struct { + filter []string + resource string + expectation bool + }{ + {[]string{"/foo/bar"}, "/foo/bar", false}, + {[]string{"/foo/b.r"}, "/foo/bar", false}, + {[]string{"[0-9]+"}, "/abcde", true}, + {[]string{"[0-9]+"}, "/abcde123", false}, + {[]string{"\\(foobar\\)"}, "(foobar)", false}, + {[]string{"\\(foobar\\)"}, "(bar)", true}, + {[]string{"(GET|POST) /healthcheck"}, "GET /foobar", true}, + {[]string{"(GET|POST) /healthcheck"}, "GET /healthcheck", false}, + {[]string{"(GET|POST) /healthcheck"}, "POST /healthcheck", false}, + {[]string{"SELECT COUNT\\(\\*\\) FROM BAR"}, "SELECT COUNT(*) FROM BAR", false}, + {[]string{"[123"}, "[123", true}, + {[]string{"\\[123"}, "[123", false}, + {[]string{"ABC+", "W+"}, "ABCCCC", false}, + {[]string{"ABC+", "W+"}, "WWW", false}, + } + + for _, test := range tests { + span := testutil.RandomSpan() + span.Resource = test.resource + filter := NewBlacklister(test.filter) + + assert.Equal(t, test.expectation, filter.Allows(span)) + } +} + +func TestCompileRules(t *testing.T) { + filter := NewBlacklister([]string{"[123", "]123", "{6}"}) + for i := 0; i < 100; i++ { + span := testutil.RandomSpan() + assert.True(t, filter.Allows(span)) + } +} diff --git a/pkg/trace/filters/replacer.go b/pkg/trace/filters/replacer.go new file mode 100644 index 0000000000000..1068f90866152 --- /dev/null +++ b/pkg/trace/filters/replacer.go @@ -0,0 +1,43 @@ +package filters + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// Replacer is a filter which replaces tag values based on its +// settings. It keeps all spans. +type Replacer struct { + rules []*config.ReplaceRule +} + +// NewReplacer returns a new Replacer which will use the given set of rules. +func NewReplacer(rules []*config.ReplaceRule) *Replacer { + return &Replacer{rules: rules} +} + +// Replace replaces all tags matching the Replacer's rules. 
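+//
+// A sketch with an assumed rule and trace (the values are examples only): a rule keyed on a tag name rewrites
+// that tag on every span of the trace; the key "resource.name" targets the span resource, and "*" applies to
+// the resource and all tags:
+//
+//	rule := &config.ReplaceRule{Name: "http.url", Pattern: "token/[^/]*", Re: regexp.MustCompile("token/[^/]*"), Repl: "token/?"}
+//	NewReplacer([]*config.ReplaceRule{rule}).Replace(&trace)
+//	// span.Meta["http.url"]: "some/token/abc123/path" -> "some/token/?/path"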
+func (f Replacer) Replace(trace *pb.Trace) { + for _, rule := range f.rules { + key, str, re := rule.Name, rule.Repl, rule.Re + for _, s := range *trace { + switch key { + case "*": + for k := range s.Meta { + s.Meta[k] = re.ReplaceAllString(s.Meta[k], str) + } + s.Resource = re.ReplaceAllString(s.Resource, str) + case "resource.name": + s.Resource = re.ReplaceAllString(s.Resource, str) + default: + if s.Meta == nil { + continue + } + if _, ok := s.Meta[key]; !ok { + continue + } + s.Meta[key] = re.ReplaceAllString(s.Meta[key], str) + } + } + } +} diff --git a/pkg/trace/filters/replacer_test.go b/pkg/trace/filters/replacer_test.go new file mode 100644 index 0000000000000..0829ed9f2fd8f --- /dev/null +++ b/pkg/trace/filters/replacer_test.go @@ -0,0 +1,130 @@ +package filters + +import ( + "regexp" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func TestReplacer(t *testing.T) { + assert := assert.New(t) + for _, tt := range []struct { + rules [][3]string + got, want map[string]string + }{ + { + rules: [][3]string{ + {"http.url", "(token/)([^/]*)", "${1}?"}, + {"http.url", "guid", "[REDACTED]"}, + {"custom.tag", "(/foo/bar/).*", "${1}extra"}, + {"a", "b", "c"}, + }, + got: map[string]string{ + "http.url": "some/guid/token/abcdef/abc", + "custom.tag": "/foo/bar/foo", + }, + want: map[string]string{ + "http.url": "some/[REDACTED]/token/?/abc", + "custom.tag": "/foo/bar/extra", + }, + }, + { + rules: [][3]string{ + {"*", "(token/)([^/]*)", "${1}?"}, + {"*", "this", "that"}, + {"http.url", "guid", "[REDACTED]"}, + {"custom.tag", "(/foo/bar/).*", "${1}extra"}, + {"resource.name", "prod", "stage"}, + }, + got: map[string]string{ + "resource.name": "this is prod", + "http.url": "some/[REDACTED]/token/abcdef/abc", + "other.url": "some/guid/token/abcdef/abc", + "custom.tag": "/foo/bar/foo", + }, + want: map[string]string{ + "resource.name": "that is stage", + "http.url": "some/[REDACTED]/token/?/abc", + "other.url": "some/guid/token/?/abc", + "custom.tag": "/foo/bar/extra", + }, + }, + } { + rules := parseRulesFromString(tt.rules) + tr := NewReplacer(rules) + root := replaceFilterTestSpan(tt.got) + childSpan := replaceFilterTestSpan(tt.got) + trace := pb.Trace{root, childSpan} + tr.Replace(&trace) + for k, v := range tt.want { + switch k { + case "resource.name": + // test that the filter applies to all spans, not only the root + assert.Equal(v, root.Resource) + assert.Equal(v, childSpan.Resource) + default: + assert.Equal(v, root.Meta[k]) + assert.Equal(v, childSpan.Meta[k]) + } + } + } +} + +func parseRulesFromString(rules [][3]string) []*config.ReplaceRule { + r := make([]*config.ReplaceRule, 0, len(rules)) + for _, rule := range rules { + key, re, str := rule[0], rule[1], rule[2] + r = append(r, &config.ReplaceRule{ + Name: key, + Pattern: re, + Re: regexp.MustCompile(re), + Repl: str, + }) + } + return r +} + +// replaceFilterTestSpan creates a span from a list of tags and uses +// special tag names (e.g. resource.name) to target attributes. +func replaceFilterTestSpan(tags map[string]string) *pb.Span { + span := &pb.Span{Meta: make(map[string]string)} + for k, v := range tags { + switch k { + case "resource.name": + span.Resource = v + default: + span.Meta[k] = v + } + } + return span +} + +// TestReplaceFilterTestSpan tests the replaceFilterTestSpan test +// helper function. 
+func TestReplaceFilterTestSpan(t *testing.T) { + for _, tt := range []struct { + tags map[string]string + want *pb.Span + }{ + { + tags: map[string]string{ + "resource.name": "a", + "http.url": "url", + "custom.tag": "val", + }, + want: &pb.Span{ + Resource: "a", + Meta: map[string]string{ + "http.url": "url", + "custom.tag": "val", + }, + }, + }, + } { + got := replaceFilterTestSpan(tt.tags) + assert.Equal(t, tt.want, got) + } +} diff --git a/pkg/trace/flags/flags.go b/pkg/trace/flags/flags.go new file mode 100644 index 0000000000000..dcf02be34d524 --- /dev/null +++ b/pkg/trace/flags/flags.go @@ -0,0 +1,51 @@ +package flags + +import "flag" + +var ( + // ConfigPath specifies the path to the configuration file. + ConfigPath string + + // PIDFilePath specifies the path to the PID file. + PIDFilePath string + + // LogLevel specifies the log output level. + LogLevel string + + // Version will cause the agent to show version information. + Version bool + + // Info will display information about a running agent. + Info bool + + // CPUProfile specifies the path to output CPU profiling information to. + // When empty, CPU profiling is disabled. + CPUProfile string + + // MemProfile specifies the path to output memory profiling information to. + // When empty, memory profiling is disabled. + MemProfile string +) + +// Win holds a set of flags which will be populated only during the Windows build. +var Win = struct { + InstallService bool + UninstallService bool + StartService bool + StopService bool +}{} + +func init() { + flag.StringVar(&ConfigPath, "config", DefaultConfigPath, "Datadog Agent config file location") + flag.StringVar(&PIDFilePath, "pid", "", "Path to set pidfile for process") + flag.BoolVar(&Version, "version", false, "Show version information and exit") + flag.BoolVar(&Info, "info", false, "Show info about running trace agent process and exit") + + // profiling + flag.StringVar(&CPUProfile, "cpuprofile", "", "Write cpu profile to file") + flag.StringVar(&MemProfile, "memprofile", "", "Write memory profile to `file`") + + registerOSSpecificFlags() + + flag.Parse() +} diff --git a/pkg/trace/flags/flags_nix.go b/pkg/trace/flags/flags_nix.go new file mode 100644 index 0000000000000..1432e68c0b131 --- /dev/null +++ b/pkg/trace/flags/flags_nix.go @@ -0,0 +1,8 @@ +// +build !windows + +package flags + +// DefaultConfigPath specifies the default configuration file path for non-Windows systems. 
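+//
+// The default can be overridden at startup with the -config flag registered in flags.go, e.g. (assuming the
+// binary built from cmd/trace-agent):
+//
+//	trace-agent -config /etc/datadog-agent/datadog.yaml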
+const DefaultConfigPath = "/opt/datadog-agent/etc/datadog.yaml" + +func registerOSSpecificFlags() {} diff --git a/pkg/trace/flags/flags_windows.go b/pkg/trace/flags/flags_windows.go new file mode 100644 index 0000000000000..6156f624cd0d4 --- /dev/null +++ b/pkg/trace/flags/flags_windows.go @@ -0,0 +1,25 @@ +// +build windows + +package flags + +import ( + "flag" + "path/filepath" + + "github.com/DataDog/datadog-agent/pkg/util/winutil" +) + +var DefaultConfigPath = "c:\\programdata\\datadog\\datadog.yaml" + +func init() { + pd, err := winutil.GetProgramDataDir() + if err == nil { + DefaultConfigPath = filepath.Join(pd, "Datadog", "datadog.yaml") + } +} +func registerOSSpecificFlags() { + flag.BoolVar(&Win.InstallService, "install-service", false, "Install the trace agent to the Service Control Manager") + flag.BoolVar(&Win.UninstallService, "uninstall-service", false, "Remove the trace agent from the Service Control Manager") + flag.BoolVar(&Win.StartService, "start-service", false, "Starts the trace agent service") + flag.BoolVar(&Win.StopService, "stop-service", false, "Stops the trace agent service") +} diff --git a/pkg/trace/info/endpoint.go b/pkg/trace/info/endpoint.go new file mode 100644 index 0000000000000..1bb4f73173959 --- /dev/null +++ b/pkg/trace/info/endpoint.go @@ -0,0 +1,27 @@ +package info + +// EndpointStats contains stats about the volume of data written +type EndpointStats struct { + // TracesPayload is the number of traces payload sent, including errors. + // If several URLs are given, each URL counts for one. + TracesPayload int64 + // TracesPayloadError is the number of traces payload sent with an error. + // If several URLs are given, each URL counts for one. + TracesPayloadError int64 + // TracesBytes is the size of the traces payload data sent, including errors. + // If several URLs are given, it does not change the size (shared for all). + // This is the raw data, encoded, compressed. + TracesBytes int64 + // TracesStats is the number of stats in the traces payload data sent, including errors. + // If several URLs are given, it does not change the size (shared for all). + TracesStats int64 + // TracesPayload is the number of services payload sent, including errors. + // If several URLs are given, each URL counts for one. + ServicesPayload int64 + // ServicesPayloadError is the number of services payload sent with an error. + // If several URLs are given, each URL counts for one. + ServicesPayloadError int64 + // TracesBytes is the size of the services payload data sent, including errors. + // If several URLs are given, it does not change the size (shared for all). + ServicesBytes int64 +} diff --git a/pkg/trace/info/git_version.go b/pkg/trace/info/git_version.go new file mode 100644 index 0000000000000..676f5888270e5 --- /dev/null +++ b/pkg/trace/info/git_version.go @@ -0,0 +1,17 @@ + +// Code generated by 'go run make.go'. DO NOT EDIT. 
+ +package info + +import ( + "runtime" + "strings" +) + +func init() { + Version = "0.99.0" + GitCommit = "90dcab81" + GitBranch = "HEAD" + BuildDate = "2019-01-08 10:02:22.993077 +0100 CET m=+0.007560824" + GoVersion = strings.TrimPrefix(runtime.Version(), "go") +} diff --git a/pkg/trace/info/info.go b/pkg/trace/info/info.go new file mode 100644 index 0000000000000..66716b72ca88d --- /dev/null +++ b/pkg/trace/info/info.go @@ -0,0 +1,424 @@ +package info + +import ( + "bytes" + "encoding/json" + "expvar" // automatically publish `/debug/vars` on HTTP port + + "fmt" + "io" + "net/http" + "os" + "regexp" + "strings" + "sync" + "text/template" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +var ( + infoMu sync.RWMutex + receiverStats []TagStats // only for the last minute + languages []string + + // TODO: move from package globals to a clean single struct + + traceWriterInfo TraceWriterInfo + statsWriterInfo StatsWriterInfo + serviceWriterInfo ServiceWriterInfo + + watchdogInfo watchdog.Info + samplerInfo SamplerInfo + prioritySamplerInfo SamplerInfo + errorsSamplerInfo SamplerInfo + rateByService map[string]float64 + preSamplerStats sampler.PreSamplerStats + start = time.Now() + once sync.Once + infoTmpl *template.Template + notRunningTmpl *template.Template + errorTmpl *template.Template +) + +const ( + infoTmplSrc = `{{.Banner}} +{{.Program}} +{{.Banner}} + + Pid: {{.Status.Pid}} + Uptime: {{.Status.Uptime}} seconds + Mem alloc: {{.Status.MemStats.Alloc}} bytes + + Hostname: {{.Status.Config.Hostname}} + Receiver: {{.Status.Config.ReceiverHost}}:{{.Status.Config.ReceiverPort}} + Endpoints: + {{ range $i, $e := .Status.Config.Endpoints}} + {{ $e.Host }} + {{end}} + + --- Receiver stats (1 min) --- + + {{ range $i, $ts := .Status.Receiver }} + From {{if $ts.Tags.Lang}}{{ $ts.Tags.Lang }} {{ $ts.Tags.LangVersion }} ({{ $ts.Tags.Interpreter }}), client {{ $ts.Tags.TracerVersion }}{{else}}unknown clients{{end}} + Traces received: {{ $ts.Stats.TracesReceived }} ({{ $ts.Stats.TracesBytes }} bytes) + Spans received: {{ $ts.Stats.SpansReceived }} + Services received: {{ $ts.Stats.ServicesReceived }} ({{ $ts.Stats.ServicesBytes }} bytes) + {{if gt $ts.Stats.TracesDropped 0}} + WARNING: Traces dropped: {{ $ts.Stats.TracesDropped }} + {{end}} + {{if gt $ts.Stats.SpansDropped 0}} + WARNING: Spans dropped: {{ $ts.Stats.SpansDropped }} + {{end}} + + {{end}} + {{ range $key, $value := .Status.RateByService }} + Priority sampling rate for '{{ $key }}': {{percent $value}} % + {{ end }} + {{if lt .Status.PreSampler.Rate 1.0}} + WARNING: Pre-sampling traces: {{percent .Status.PreSampler.Rate}} % + {{end}} + {{if .Status.PreSampler.Error}} + WARNING: Pre-sampler: {{.Status.PreSampler.Error}} + {{end}} + + --- Writer stats (1 min) --- + + Traces: {{.Status.TraceWriter.Payloads}} payloads, {{.Status.TraceWriter.Traces}} traces, {{if gt .Status.TraceWriter.Events 0}}{{.Status.TraceWriter.Events}} events, {{end}}{{.Status.TraceWriter.Bytes}} bytes + {{if gt .Status.TraceWriter.Errors 0}}WARNING: Traces API errors (1 min): {{.Status.TraceWriter.Errors}}{{end}} + Stats: {{.Status.StatsWriter.Payloads}} payloads, {{.Status.StatsWriter.StatsBuckets}} stats buckets, {{.Status.StatsWriter.Bytes}} bytes + {{if gt .Status.StatsWriter.Errors 0}}WARNING: Stats API errors (1 min): {{.Status.StatsWriter.Errors}}{{end}} + Services: {{.Status.ServiceWriter.Payloads}} payloads, 
{{.Status.ServiceWriter.Services}} services, {{.Status.ServiceWriter.Bytes}} bytes + {{if gt .Status.ServiceWriter.Errors 0}}WARNING: Services API errors (1 min): {{.Status.ServiceWriter.Errors}}{{end}} +` + + notRunningTmplSrc = `{{.Banner}} +{{.Program}} +{{.Banner}} + + Not running (port {{.ReceiverPort}}) + +` + + errorTmplSrc = `{{.Banner}} +{{.Program}} +{{.Banner}} + + Error: {{.Error}} + URL: {{.URL}} + +` +) + +// UpdateReceiverStats updates internal stats about the receiver. +func UpdateReceiverStats(rs *ReceiverStats) { + infoMu.Lock() + defer infoMu.Unlock() + rs.RLock() + defer rs.RUnlock() + + s := make([]TagStats, 0, len(rs.Stats)) + for _, tagStats := range rs.Stats { + if !tagStats.isEmpty() { + s = append(s, *tagStats) + } + } + + receiverStats = s + languages = rs.Languages() +} + +// Languages exposes languages reporting traces to the Agent. +func Languages() []string { + infoMu.Lock() + defer infoMu.Unlock() + + return languages +} + +func publishReceiverStats() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return receiverStats +} + +// UpdateSamplerInfo updates internal stats about signature sampling. +func UpdateSamplerInfo(ss SamplerInfo) { + infoMu.Lock() + defer infoMu.Unlock() + + samplerInfo = ss +} + +func publishSamplerInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return samplerInfo +} + +// UpdatePrioritySamplerInfo updates internal stats about priority sampling. +func UpdatePrioritySamplerInfo(ss SamplerInfo) { + infoMu.Lock() + defer infoMu.Unlock() + + prioritySamplerInfo = ss +} + +func publishPrioritySamplerInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return prioritySamplerInfo +} + +// UpdateErrorsSamplerInfo updates internal stats about error sampling. +func UpdateErrorsSamplerInfo(ss SamplerInfo) { + infoMu.Lock() + defer infoMu.Unlock() + + errorsSamplerInfo = ss +} + +func publishErrorsSamplerInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return errorsSamplerInfo +} + +// UpdateRateByService updates the RateByService map. +func UpdateRateByService(rbs map[string]float64) { + infoMu.Lock() + defer infoMu.Unlock() + rateByService = rbs +} + +func publishRateByService() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return rateByService +} + +// UpdateWatchdogInfo updates internal stats about the watchdog. +func UpdateWatchdogInfo(wi watchdog.Info) { + infoMu.Lock() + defer infoMu.Unlock() + watchdogInfo = wi +} + +func publishWatchdogInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return watchdogInfo +} + +// UpdatePreSampler updates internal stats about the pre-sampling. +func UpdatePreSampler(ss sampler.PreSamplerStats) { + infoMu.Lock() + defer infoMu.Unlock() + preSamplerStats = ss +} + +func publishPreSamplerStats() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return preSamplerStats +} + +func publishUptime() interface{} { + return int(time.Since(start) / time.Second) +} + +type infoString string + +func (s infoString) String() string { return string(s) } + +// InitInfo initializes the info structure. It should be called only once. 
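+//
+// Typical startup usage (sketch):
+//
+//	conf := config.New()
+//	if err := InitInfo(conf); err != nil {
+//		// expvar publishing or template parsing failed
+//	}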
+func InitInfo(conf *config.AgentConfig) error { + var err error + + funcMap := template.FuncMap{ + "add": func(a, b int64) int64 { + return a + b + }, + "percent": func(v float64) string { + return fmt.Sprintf("%02.1f", v*100) + }, + } + + once.Do(func() { + expvar.NewInt("pid").Set(int64(os.Getpid())) + expvar.Publish("uptime", expvar.Func(publishUptime)) + expvar.Publish("version", expvar.Func(publishVersion)) + expvar.Publish("receiver", expvar.Func(publishReceiverStats)) + expvar.Publish("sampler", expvar.Func(publishSamplerInfo)) + expvar.Publish("trace_writer", expvar.Func(publishTraceWriterInfo)) + expvar.Publish("stats_writer", expvar.Func(publishStatsWriterInfo)) + expvar.Publish("service_writer", expvar.Func(publishServiceWriterInfo)) + expvar.Publish("prioritysampler", expvar.Func(publishPrioritySamplerInfo)) + expvar.Publish("errorssampler", expvar.Func(publishErrorsSamplerInfo)) + expvar.Publish("ratebyservice", expvar.Func(publishRateByService)) + expvar.Publish("watchdog", expvar.Func(publishWatchdogInfo)) + expvar.Publish("presampler", expvar.Func(publishPreSamplerStats)) + + // copy the config to ensure we don't expose sensitive data such as API keys + c := *conf + c.Endpoints = make([]*config.Endpoint, len(conf.Endpoints)) + for i, e := range conf.Endpoints { + c.Endpoints[i] = &config.Endpoint{Host: e.Host, NoProxy: e.NoProxy} + } + + var buf []byte + buf, err = json.Marshal(&c) + if err != nil { + return + } + + // We keep a static copy of the config, already marshalled and stored + // as a plain string. This saves the hassle of rebuilding it all the time + // and avoids race issues as the source object is never used again. + // Config is parsed at the beginning and never changed again, anyway. + expvar.Publish("config", infoString(string(buf))) + + infoTmpl, err = template.New("info").Funcs(funcMap).Parse(infoTmplSrc) + if err != nil { + return + } + + notRunningTmpl, err = template.New("infoNotRunning").Parse(notRunningTmplSrc) + if err != nil { + return + } + + errorTmpl, err = template.New("infoError").Parse(errorTmplSrc) + if err != nil { + return + } + }) + + return err +} + +// StatusInfo is what we use to parse expvar response. +// It does not need to contain all the fields, only those we need +// to display when called with `-info` as JSON unmarshaller will +// automatically ignore extra fields. +type StatusInfo struct { + CmdLine []string `json:"cmdline"` + Pid int `json:"pid"` + Uptime int `json:"uptime"` + MemStats struct { + Alloc uint64 + } `json:"memstats"` + Version infoVersion `json:"version"` + Receiver []TagStats `json:"receiver"` + RateByService map[string]float64 `json:"ratebyservice"` + TraceWriter TraceWriterInfo `json:"trace_writer"` + StatsWriter StatsWriterInfo `json:"stats_writer"` + ServiceWriter ServiceWriterInfo `json:"service_writer"` + Watchdog watchdog.Info `json:"watchdog"` + PreSampler sampler.PreSamplerStats `json:"presampler"` + Config config.AgentConfig `json:"config"` +} + +func getProgramBanner(version string) (string, string) { + program := fmt.Sprintf("Trace Agent (v %s)", version) + banner := strings.Repeat("=", len(program)) + + return program, banner +} + +// Info writes a standard info message describing the running agent. +// This is not the current program, but an already running program, +// which we query with an HTTP request. +// +// If error is nil, means the program is running. 
+// If not, it displays a pretty-printed message anyway (for support) +func Info(w io.Writer, conf *config.AgentConfig) error { + host := conf.ReceiverHost + if host == "0.0.0.0" { + host = "127.0.0.1" // [FIXME:christian] not fool-proof + } + url := fmt.Sprintf("http://%s:%d/debug/vars", conf.ReceiverHost, conf.ReceiverPort) + client := http.Client{Timeout: 3 * time.Second} + resp, err := client.Get(url) + if err != nil { + // OK, here, we can't even make an http call on the agent port, + // so we can assume it's not even running, or at least, not with + // these parameters. We display the port as a hint on where to + // debug further, this is where the expvar JSON should come from. + program, banner := getProgramBanner(Version) + _ = notRunningTmpl.Execute(w, struct { + Banner string + Program string + ReceiverPort int + }{ + Banner: banner, + Program: program, + ReceiverPort: conf.ReceiverPort, + }) + return err + } + + defer resp.Body.Close() // OK to defer, this is not on hot path + + var info StatusInfo + if err := json.NewDecoder(resp.Body).Decode(&info); err != nil { + program, banner := getProgramBanner(Version) + _ = errorTmpl.Execute(w, struct { + Banner string + Program string + Error error + URL string + }{ + Banner: banner, + Program: program, + Error: err, + URL: url, + }) + return err + } + + // display the remote program version, now that we know it + program, banner := getProgramBanner(info.Version.Version) + + // remove the default service and env, it can be inferred from other + // values so has little added-value and could be confusing for users. + // Besides, if one still really wants it: + // curl http://localhost:8126/debug/vars would show it. + if info.RateByService != nil { + delete(info.RateByService, "service:,env:") + } + + var buffer bytes.Buffer + + err = infoTmpl.Execute(&buffer, struct { + Banner string + Program string + Status *StatusInfo + }{ + Banner: banner, + Program: program, + Status: &info, + }) + if err != nil { + return err + } + + cleanInfo := CleanInfoExtraLines(buffer.String()) + + w.Write([]byte(cleanInfo)) + // w.Write(buffer.Bytes()) + + return nil +} + +// CleanInfoExtraLines removes empty lines from template code indentation. +// The idea is that an indented empty line (only indentation spaces) is because of code indentation, +// so we remove it. +// Real legit empty lines contain no space. 
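+//
+// For example, lines that contain only indentation spaces are collapsed, while genuinely empty lines are kept:
+//
+//	CleanInfoExtraLines("a\n    \n    \nb\n") // returns "a\nb\n"
+//	CleanInfoExtraLines("a\n\nb\n")           // unchanged: "a\n\nb\n"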
+func CleanInfoExtraLines(info string) string { + var indentedEmptyLines = regexp.MustCompile("\n( +\n)+") + return indentedEmptyLines.ReplaceAllString(info, "\n") +} diff --git a/pkg/trace/info/info_test.go b/pkg/trace/info/info_test.go new file mode 100644 index 0000000000000..f20ec654eac35 --- /dev/null +++ b/pkg/trace/info/info_test.go @@ -0,0 +1,355 @@ +package info + +import ( + "bytes" + "encoding/json" + "expvar" + "fmt" + "io/ioutil" + "net/http" + "net/http/httptest" + "net/url" + "regexp" + "strconv" + "strings" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/stretchr/testify/assert" +) + +type testServerHandler struct { + t *testing.T +} + +func (h *testServerHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + json, err := ioutil.ReadFile("./testdata/okay.json") + if err != nil { + h.t.Errorf("error loading json file: %v", err) + } + + switch r.URL.Path { + case "/debug/vars": + h.t.Logf("serving fake (static) info data for %s", r.URL.Path) + _, err := w.Write(json) + if err != nil { + h.t.Errorf("error serving %s: %v", r.URL.Path, err) + } + default: + h.t.Logf("answering 404 for %s", r.URL.Path) + w.WriteHeader(http.StatusNotFound) + } +} + +func testServer(t *testing.T) *httptest.Server { + server := httptest.NewServer(&testServerHandler{t: t}) + t.Logf("test server (serving fake yet valid data) listening on %s", server.URL) + return server +} + +type testServerWarningHandler struct { + t *testing.T +} + +func (h *testServerWarningHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + json, err := ioutil.ReadFile("./testdata/warning.json") + if err != nil { + h.t.Errorf("error loading json file: %v", err) + } + + switch r.URL.Path { + case "/debug/vars": + h.t.Logf("serving fake (static) info data for %s", r.URL.Path) + _, err := w.Write(json) + if err != nil { + h.t.Errorf("error serving %s: %v", r.URL.Path, err) + } + default: + h.t.Logf("answering 404 for %s", r.URL.Path) + w.WriteHeader(http.StatusNotFound) + } +} + +func testServerWarning(t *testing.T) *httptest.Server { + server := httptest.NewServer(&testServerWarningHandler{t: t}) + t.Logf("test server (serving data containing worrying values) listening on %s", server.URL) + return server +} + +type testServerErrorHandler struct { + t *testing.T +} + +func (h *testServerErrorHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + switch r.URL.Path { + case "/debug/vars": + h.t.Logf("serving fake (static) info data for %s", r.URL.Path) + _, err := w.Write([]byte(`this is *NOT* a valid JSON, no way...`)) + if err != nil { + h.t.Errorf("error serving %s: %v", r.URL.Path, err) + } + default: + h.t.Logf("answering 404 for %s", r.URL.Path) + w.WriteHeader(http.StatusNotFound) + } +} + +func testServerError(t *testing.T) *httptest.Server { + server := httptest.NewServer(&testServerErrorHandler{t: t}) + t.Logf("test server (serving bad data to trigger errors) listening on %s", server.URL) + return server +} + +// run this at the beginning of each test, this is because we *really* +// need to have InitInfo be called before doing anything +func testInit(t *testing.T) *config.AgentConfig { + assert := assert.New(t) + conf := config.New() + conf.Endpoints[0].APIKey = "key1" + conf.Endpoints = append(conf.Endpoints, &config.Endpoint{Host: "ABC", APIKey: "key2"}) + assert.NotNil(conf) + + err := InitInfo(conf) + 
assert.Nil(err) + + return conf +} + +func TestInfo(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + assert.NotNil(conf) + + server := testServer(t) + assert.NotNil(server) + defer server.Close() + + url, err := url.Parse(server.URL) + assert.NotNil(url) + assert.Nil(err) + + hostPort := strings.Split(url.Host, ":") + assert.Equal(2, len(hostPort)) + port, err := strconv.Atoi(hostPort[1]) + assert.Nil(err) + conf.ReceiverPort = port + + var buf bytes.Buffer + err = Info(&buf, conf) + assert.NoError(err) + info := buf.String() + t.Logf("Info:\n%s\n", info) + expectedInfo, err := ioutil.ReadFile("./testdata/okay.info") + assert.NoError(err) + assert.Equal(string(expectedInfo), info) +} + +func TestHideAPIKeys(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + + js := expvar.Get("config").String() + assert.NotEqual("", js) + var got config.AgentConfig + err := json.Unmarshal([]byte(js), &got) + assert.Nil(err) + assert.NotEmpty(conf.Endpoints[0].APIKey) + assert.Empty(got.Endpoints[0].APIKey) +} + +func TestWarning(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + assert.NotNil(conf) + + server := testServerWarning(t) + assert.NotNil(server) + defer server.Close() + + url, err := url.Parse(server.URL) + assert.NotNil(url) + assert.Nil(err) + + hostPort := strings.Split(url.Host, ":") + assert.Equal(2, len(hostPort)) + port, err := strconv.Atoi(hostPort[1]) + assert.Nil(err) + conf.ReceiverPort = port + + var buf bytes.Buffer + err = Info(&buf, conf) + assert.Nil(err) + info := buf.String() + + expectedWarning, err := ioutil.ReadFile("./testdata/warning.info") + assert.NoError(err) + assert.Equal(string(expectedWarning), info) + + t.Logf("Info:\n%s\n", info) +} + +func TestNotRunning(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + assert.NotNil(conf) + + server := testServer(t) + assert.NotNil(server) + + url, err := url.Parse(server.URL) + assert.NotNil(url) + assert.Nil(err) + + server.Close() + + hostPort := strings.Split(url.Host, ":") + assert.Equal(2, len(hostPort)) + port, err := strconv.Atoi(hostPort[1]) + assert.Nil(err) + conf.ReceiverPort = port + + var buf bytes.Buffer + err = Info(&buf, conf) + assert.NotNil(err) + info := buf.String() + + t.Logf("Info:\n%s\n", info) + + lines := strings.Split(info, "\n") + assert.Equal(7, len(lines)) + assert.Regexp(regexp.MustCompile(`^={10,100}$`), lines[0]) + assert.Regexp(regexp.MustCompile(`^Trace Agent \(v.*\)$`), lines[1]) + assert.Regexp(regexp.MustCompile(`^={10,100}$`), lines[2]) + assert.Equal(len(lines[1]), len(lines[0])) + assert.Equal(len(lines[1]), len(lines[2])) + assert.Equal("", lines[3]) + assert.Equal(fmt.Sprintf(" Not running (port %d)", port), lines[4]) + assert.Equal("", lines[5]) + assert.Equal("", lines[6]) +} + +func TestError(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + assert.NotNil(conf) + + server := testServerError(t) + assert.NotNil(server) + defer server.Close() + + url, err := url.Parse(server.URL) + assert.NotNil(url) + assert.Nil(err) + + hostPort := strings.Split(url.Host, ":") + assert.Equal(2, len(hostPort)) + port, err := strconv.Atoi(hostPort[1]) + assert.Nil(err) + conf.ReceiverPort = port + + var buf bytes.Buffer + err = Info(&buf, conf) + assert.NotNil(err) + info := buf.String() + + t.Logf("Info:\n%s\n", info) + + lines := strings.Split(info, "\n") + assert.Equal(8, len(lines)) + assert.Regexp(regexp.MustCompile(`^={10,100}$`), lines[0]) + assert.Regexp(regexp.MustCompile(`^Trace Agent \(v.*\)$`), lines[1]) + 
assert.Regexp(regexp.MustCompile(`^={10,100}$`), lines[2]) + assert.Equal(len(lines[1]), len(lines[0])) + assert.Equal(len(lines[1]), len(lines[2])) + assert.Equal("", lines[3]) + assert.Regexp(regexp.MustCompile(`^ Error: .*$`), lines[4]) + assert.Equal(fmt.Sprintf(" URL: http://localhost:%d/debug/vars", port), lines[5]) + assert.Equal("", lines[6]) + assert.Equal("", lines[7]) +} + +func TestInfoReceiverStats(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + assert.NotNil(conf) + + stats := NewReceiverStats() + t1 := &TagStats{ + Tags{Lang: "python"}, + Stats{TracesReceived: 23, TracesDropped: 2, TracesBytes: 3244, SpansReceived: 213, SpansDropped: 14}, + } + t2 := &TagStats{ + Tags{Lang: "go"}, + Stats{ServicesReceived: 4, ServicesBytes: 1543}, + } + stats.Stats = map[Tags]*TagStats{ + t1.Tags: t1, + t2.Tags: t2, + } + + // run this with -race flag + done := make(chan struct{}, 4) + for i := 0; i < 2; i++ { + go func() { + for j := 0; j < 1000; j++ { + UpdateReceiverStats(stats) + } + done <- struct{}{} + }() + } + for i := 0; i < 2; i++ { + go func() { + for j := 0; j < 1000; j++ { + _ = publishReceiverStats() + } + done <- struct{}{} + }() + } + for i := 0; i < 4; i++ { + <-done + } + s := publishReceiverStats() + switch s := s.(type) { + case []TagStats: + for _, tagStats := range s { + assert.Equal(*stats.Stats[tagStats.Tags], tagStats) + } + default: + t.Errorf("bad stats type: %v", s) + } + stats.Stats[t1.Tags].TracesReceived++ + UpdateReceiverStats(stats) + s = publishReceiverStats() + switch s := s.(type) { + case []TagStats: + for _, tagStats := range s { + if tagStats.Tags == t1.Tags { + assert.Equal(t1.Stats.TracesReceived, tagStats.Stats.TracesReceived) + } + } + default: + t.Errorf("bad stats type: %v", s) + } +} + +func TestInfoConfig(t *testing.T) { + assert := assert.New(t) + conf := testInit(t) + assert.NotNil(conf) + + js := expvar.Get("config").String() // this is what expvar will call + assert.NotEqual("", js) + var confCopy config.AgentConfig + err := json.Unmarshal([]byte(js), &confCopy) + assert.Nil(err) + for i, e := range confCopy.Endpoints { + assert.Equal("", e.APIKey, "API Keys should *NEVER* be exported") + conf.Endpoints[i].APIKey = "" // make conf equal to confCopy to assert equality of other fields + } + assert.Equal(*conf, confCopy) // ensure all fields have been exported then parsed correctly +} diff --git a/pkg/trace/info/make.go b/pkg/trace/info/make.go new file mode 100644 index 0000000000000..c8addcdb3d7df --- /dev/null +++ b/pkg/trace/info/make.go @@ -0,0 +1,63 @@ +// +build ignore + +// The 'make' program is run by go generate to compile the versioning information +// into the info package. It expects the 'git' command to be installed. +package main + +import ( + "bytes" + "fmt" + "io/ioutil" + "log" + "os" + "os/exec" + "time" +) + +// runs runs the given command and returns the output. If it fails, +// or if the result is an empty string, it returns the fallback. +func run(fallback, name string, args ...string) string { + cmd := exec.Command(name, args...) 
+ out, err := cmd.Output() + if err != nil || len(out) == 0 { + return fallback + } + return string(bytes.Trim(out, "\n")) +} + +func main() { + log.SetPrefix("make_version") + log.SetFlags(0) + + commit := run("master", "git", "rev-parse", "--short", "HEAD") + branch := run("master", "git", "rev-parse", "--abbrev-ref", "HEAD") + version := os.Getenv("TRACE_AGENT_VERSION") + if version == "" { + version = "0.99.0" + } + + output := fmt.Sprintf(template, version, commit, branch, time.Now().String()) + err := ioutil.WriteFile("git_version.go", []byte(output), 0664) + if err != nil { + log.Fatal(err) + } +} + +const template = ` +// Code generated by 'go run make.go'. DO NOT EDIT. + +package info + +import ( + "runtime" + "strings" +) + +func init() { + Version = %[1]q + GitCommit = %[2]q + GitBranch = %[3]q + BuildDate = %[4]q + GoVersion = strings.TrimPrefix(runtime.Version(), "go") +} +` diff --git a/pkg/trace/info/sampler.go b/pkg/trace/info/sampler.go new file mode 100644 index 0000000000000..7994d1ce5b78f --- /dev/null +++ b/pkg/trace/info/sampler.go @@ -0,0 +1,19 @@ +package info + +import "github.com/DataDog/datadog-agent/pkg/trace/sampler" + +// SamplerInfo represents internal stats and state of a sampler +type SamplerInfo struct { + // Stats contains statistics about what the sampler is doing. + Stats SamplerStats + // State is the internal state of the sampler (for debugging mostly) + State sampler.InternalState +} + +// SamplerStats contains sampler statistics +type SamplerStats struct { + // KeptTPS is the number of traces kept (average per second for last flush) + KeptTPS float64 + // TotalTPS is the total number of traces (average per second for last flush) + TotalTPS float64 +} diff --git a/pkg/trace/info/stats.go b/pkg/trace/info/stats.go new file mode 100644 index 0000000000000..c86dfaea0c3a3 --- /dev/null +++ b/pkg/trace/info/stats.go @@ -0,0 +1,284 @@ +package info + +import ( + "fmt" + "sort" + "sync" + "sync/atomic" + + "github.com/DataDog/datadog-agent/pkg/trace/metrics" +) + +// ReceiverStats is used to store all the stats per tags. +type ReceiverStats struct { + sync.RWMutex + Stats map[Tags]*TagStats +} + +// NewReceiverStats returns a new ReceiverStats +func NewReceiverStats() *ReceiverStats { + return &ReceiverStats{sync.RWMutex{}, map[Tags]*TagStats{}} +} + +// GetTagStats returns the struct in which the stats will be stored depending of their tags. +func (rs *ReceiverStats) GetTagStats(tags Tags) *TagStats { + rs.Lock() + tagStats, ok := rs.Stats[tags] + if !ok { + tagStats = newTagStats(tags) + rs.Stats[tags] = tagStats + } + rs.Unlock() + + return tagStats +} + +// Acc accumulates the stats from another ReceiverStats struct. +func (rs *ReceiverStats) Acc(recent *ReceiverStats) { + recent.Lock() + for _, tagStats := range recent.Stats { + ts := rs.GetTagStats(tagStats.Tags) + ts.update(&tagStats.Stats) + } + recent.Unlock() +} + +// Publish updates stats about per-tag stats +func (rs *ReceiverStats) Publish() { + rs.RLock() + for _, tagStats := range rs.Stats { + tagStats.publish() + } + rs.RUnlock() +} + +// Languages returns the set of languages reporting traces to the Agent. 
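+//
+// The result is deduplicated and sorted, e.g. reporting go and python tracers yields []string{"go", "python"}.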
+func (rs *ReceiverStats) Languages() []string { + langSet := make(map[string]bool) + langs := []string{} + + rs.RLock() + for tags := range rs.Stats { + if _, ok := langSet[tags.Lang]; !ok { + langs = append(langs, tags.Lang) + langSet[tags.Lang] = true + } + } + rs.RUnlock() + + sort.Strings(langs) + + return langs +} + +// Reset resets the ReceiverStats internal data +func (rs *ReceiverStats) Reset() { + rs.Lock() + for key, tagStats := range rs.Stats { + // If a tagStats was empty, let's drop it. + // That's a way to avoid over-time leaks. + if tagStats.isEmpty() { + delete(rs.Stats, key) + } + tagStats.reset() + } + rs.Unlock() +} + +// Strings gives a multi strings representation of the ReceiverStats struct. +func (rs *ReceiverStats) Strings() []string { + rs.RLock() + defer rs.RUnlock() + + if len(rs.Stats) == 0 { + return []string{"no data received"} + } + + strings := make([]string, 0, len(rs.Stats)) + + for _, ts := range rs.Stats { + if !ts.isEmpty() { + strings = append(strings, fmt.Sprintf("%v -> %s", ts.Tags.toArray(), ts.String())) + } + } + return strings +} + +// TagStats is the struct used to associate the stats with their set of tags. +type TagStats struct { + Tags + Stats +} + +func newTagStats(tags Tags) *TagStats { + return &TagStats{tags, Stats{}} +} + +func (ts *TagStats) publish() { + // Atomically load the stats from ts + tracesReceived := atomic.LoadInt64(&ts.TracesReceived) + tracesDropped := atomic.LoadInt64(&ts.TracesDropped) + tracesFiltered := atomic.LoadInt64(&ts.TracesFiltered) + tracesPriorityNone := atomic.LoadInt64(&ts.TracesPriorityNone) + tracesPriorityNeg := atomic.LoadInt64(&ts.TracesPriorityNeg) + tracesPriority0 := atomic.LoadInt64(&ts.TracesPriority0) + tracesPriority1 := atomic.LoadInt64(&ts.TracesPriority1) + tracesPriority2 := atomic.LoadInt64(&ts.TracesPriority2) + tracesBytes := atomic.LoadInt64(&ts.TracesBytes) + spansReceived := atomic.LoadInt64(&ts.SpansReceived) + spansDropped := atomic.LoadInt64(&ts.SpansDropped) + spansFiltered := atomic.LoadInt64(&ts.SpansFiltered) + servicesReceived := atomic.LoadInt64(&ts.ServicesReceived) + servicesBytes := atomic.LoadInt64(&ts.ServicesBytes) + eventsExtracted := atomic.LoadInt64(&ts.EventsExtracted) + eventsSampled := atomic.LoadInt64(&ts.EventsSampled) + + // Publish the stats + tags := ts.Tags.toArray() + + metrics.Count("datadog.trace_agent.receiver.trace", tracesReceived, tags, 1) + metrics.Count("datadog.trace_agent.receiver.traces_received", tracesReceived, tags, 1) + metrics.Count("datadog.trace_agent.receiver.traces_dropped", tracesDropped, tags, 1) + metrics.Count("datadog.trace_agent.receiver.traces_filtered", tracesFiltered, tags, 1) + metrics.Count("datadog.trace_agent.receiver.traces_priority", tracesPriorityNone, append(tags, "priority:none"), 1) + metrics.Count("datadog.trace_agent.receiver.traces_priority", tracesPriorityNeg, append(tags, "priority:neg"), 1) + metrics.Count("datadog.trace_agent.receiver.traces_priority", tracesPriority0, append(tags, "priority:0"), 1) + metrics.Count("datadog.trace_agent.receiver.traces_priority", tracesPriority1, append(tags, "priority:1"), 1) + metrics.Count("datadog.trace_agent.receiver.traces_priority", tracesPriority2, append(tags, "priority:2"), 1) + metrics.Count("datadog.trace_agent.receiver.traces_bytes", tracesBytes, tags, 1) + metrics.Count("datadog.trace_agent.receiver.spans_received", spansReceived, tags, 1) + metrics.Count("datadog.trace_agent.receiver.spans_dropped", spansDropped, tags, 1) + 
metrics.Count("datadog.trace_agent.receiver.spans_filtered", spansFiltered, tags, 1) + metrics.Count("datadog.trace_agent.receiver.services_received", servicesReceived, tags, 1) + metrics.Count("datadog.trace_agent.receiver.services_bytes", servicesBytes, tags, 1) + metrics.Count("datadog.trace_agent.receiver.events_extracted", eventsExtracted, tags, 1) + metrics.Count("datadog.trace_agent.receiver.events_sampled", eventsSampled, tags, 1) +} + +// Stats holds the metrics that will be reported every 10s by the agent. +// Its fields require to be accessed in an atomic way. +type Stats struct { + // TracesReceived is the total number of traces received, including the dropped ones. + TracesReceived int64 + // TracesDropped is the number of traces dropped. + TracesDropped int64 + // TracesFiltered is the number of traces filtered. + TracesFiltered int64 + // TracesPriorityNone is the number of traces with no sampling priority. + TracesPriorityNone int64 + // TracesPriorityNeg is the number of traces with a negative sampling priority. + TracesPriorityNeg int64 + // TracesPriority0 is the number of traces with sampling priority set to zero. + TracesPriority0 int64 + // TracesPriority1 is the number of traces with sampling priority automatically set to 1. + TracesPriority1 int64 + // TracesPriority2 is the number of traces with sampling priority manually set to 2 or more. + TracesPriority2 int64 + // TracesBytes is the amount of data received on the traces endpoint (raw data, encoded, compressed). + TracesBytes int64 + // SpansReceived is the total number of spans received, including the dropped ones. + SpansReceived int64 + // SpansDropped is the number of spans dropped. + SpansDropped int64 + // SpansFiltered is the number of spans filtered. + SpansFiltered int64 + // ServicesReceived is the number of services received. + ServicesReceived int64 + // ServicesBytes is the amount of data received on the services endpoint (raw data, encoded, compressed). + ServicesBytes int64 + // EventsExtracted is the total number of APM events extracted from traces. + EventsExtracted int64 + // EventsSampled is the total number of APM events sampled. 
+ EventsSampled int64 +} + +func (s *Stats) update(recent *Stats) { + atomic.AddInt64(&s.TracesReceived, atomic.LoadInt64(&recent.TracesReceived)) + atomic.AddInt64(&s.TracesDropped, atomic.LoadInt64(&recent.TracesDropped)) + atomic.AddInt64(&s.TracesFiltered, atomic.LoadInt64(&recent.TracesFiltered)) + atomic.AddInt64(&s.TracesPriorityNone, atomic.LoadInt64(&recent.TracesPriorityNone)) + atomic.AddInt64(&s.TracesPriorityNeg, atomic.LoadInt64(&recent.TracesPriorityNeg)) + atomic.AddInt64(&s.TracesPriority0, atomic.LoadInt64(&recent.TracesPriority0)) + atomic.AddInt64(&s.TracesPriority1, atomic.LoadInt64(&recent.TracesPriority1)) + atomic.AddInt64(&s.TracesPriority2, atomic.LoadInt64(&recent.TracesPriority2)) + atomic.AddInt64(&s.TracesBytes, atomic.LoadInt64(&recent.TracesBytes)) + atomic.AddInt64(&s.SpansReceived, atomic.LoadInt64(&recent.SpansReceived)) + atomic.AddInt64(&s.SpansDropped, atomic.LoadInt64(&recent.SpansDropped)) + atomic.AddInt64(&s.SpansFiltered, atomic.LoadInt64(&recent.SpansFiltered)) + atomic.AddInt64(&s.ServicesReceived, atomic.LoadInt64(&recent.ServicesReceived)) + atomic.AddInt64(&s.ServicesBytes, atomic.LoadInt64(&recent.ServicesBytes)) + atomic.AddInt64(&s.EventsExtracted, atomic.LoadInt64(&recent.EventsExtracted)) + atomic.AddInt64(&s.EventsSampled, atomic.LoadInt64(&recent.EventsSampled)) +} + +func (s *Stats) reset() { + atomic.StoreInt64(&s.TracesReceived, 0) + atomic.StoreInt64(&s.TracesDropped, 0) + atomic.StoreInt64(&s.TracesFiltered, 0) + atomic.StoreInt64(&s.TracesPriorityNone, 0) + atomic.StoreInt64(&s.TracesPriorityNeg, 0) + atomic.StoreInt64(&s.TracesPriority0, 0) + atomic.StoreInt64(&s.TracesPriority1, 0) + atomic.StoreInt64(&s.TracesPriority2, 0) + atomic.StoreInt64(&s.TracesBytes, 0) + atomic.StoreInt64(&s.SpansReceived, 0) + atomic.StoreInt64(&s.SpansDropped, 0) + atomic.StoreInt64(&s.SpansFiltered, 0) + atomic.StoreInt64(&s.ServicesReceived, 0) + atomic.StoreInt64(&s.ServicesBytes, 0) + atomic.StoreInt64(&s.EventsExtracted, 0) + atomic.StoreInt64(&s.EventsSampled, 0) +} + +func (s *Stats) isEmpty() bool { + tracesBytes := atomic.LoadInt64(&s.TracesBytes) + + return tracesBytes == 0 +} + +// String returns a string representation of the Stats struct +func (s *Stats) String() string { + // Atomically load the stats + tracesReceived := atomic.LoadInt64(&s.TracesReceived) + tracesDropped := atomic.LoadInt64(&s.TracesDropped) + tracesFiltered := atomic.LoadInt64(&s.TracesFiltered) + // Omitting priority information, use expvar or metrics for debugging purpose + tracesBytes := atomic.LoadInt64(&s.TracesBytes) + servicesReceived := atomic.LoadInt64(&s.ServicesReceived) + servicesBytes := atomic.LoadInt64(&s.ServicesBytes) + eventsExtracted := atomic.LoadInt64(&s.EventsExtracted) + eventsSampled := atomic.LoadInt64(&s.EventsSampled) + + return fmt.Sprintf("traces received: %d, traces dropped: %d, traces filtered: %d, "+ + "traces amount: %d bytes, services received: %d, services amount: %d bytes, "+ + "events extracted: %d, events sampled: %d", + tracesReceived, tracesDropped, tracesFiltered, + tracesBytes, servicesReceived, servicesBytes, + eventsExtracted, eventsSampled) +} + +// Tags holds the tags we parse when we handle the header of the payload. +type Tags struct { + Lang, LangVersion, Interpreter, TracerVersion string +} + +// toArray will transform the Tags struct into a slice of string. +// We only publish the non-empty tags. 
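+// For example (illustrative), Tags{Lang: "python", LangVersion: "2.7.6"}
+// yields []string{"lang:python", "lang_version:2.7.6"}.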
+func (t *Tags) toArray() []string { + tags := make([]string, 0, 5) + + if t.Lang != "" { + tags = append(tags, "lang:"+t.Lang) + } + if t.LangVersion != "" { + tags = append(tags, "lang_version:"+t.LangVersion) + } + if t.Interpreter != "" { + tags = append(tags, "interpreter:"+t.Interpreter) + } + if t.TracerVersion != "" { + tags = append(tags, "tracer_version:"+t.TracerVersion) + } + + return tags +} diff --git a/pkg/trace/info/testdata/okay.info b/pkg/trace/info/testdata/okay.info new file mode 100644 index 0000000000000..71e3d1bb4d34f --- /dev/null +++ b/pkg/trace/info/testdata/okay.info @@ -0,0 +1,28 @@ +====================== +Trace Agent (v 0.99.0) +====================== + + Pid: 38149 + Uptime: 15 seconds + Mem alloc: 773552 bytes + + Hostname: localhost.localdomain + Receiver: localhost:8126 + Endpoints: + https://trace1.agent.datadoghq.com + https://trace2.agent.datadoghq.com + + --- Receiver stats (1 min) --- + + From unknown clients + Traces received: 0 (0 bytes) + Spans received: 0 + Services received: 0 (0 bytes) + + Priority sampling rate for 'service:myapp,env:dev': 12.3 % + + --- Writer stats (1 min) --- + + Traces: 4 payloads, 26 traces, 123 events, 3245 bytes + Stats: 6 payloads, 12 stats buckets, 8329 bytes + Services: 1 payloads, 2 services, 1234 bytes diff --git a/pkg/trace/info/testdata/okay.json b/pkg/trace/info/testdata/okay.json new file mode 100644 index 0000000000000..1d0f8801c9f7b --- /dev/null +++ b/pkg/trace/info/testdata/okay.json @@ -0,0 +1,14 @@ +{ + "cmdline": ["./trace-agent"], + "config": {"Enabled":true,"Hostname":"localhost.localdomain","DefaultEnv":"none","Endpoints":[{"Host": "https://trace1.agent.datadoghq.com"}, {"Host": "https://trace2.agent.datadoghq.com"}],"APIPayloadBufferMaxSize":16777216,"BucketInterval":10000000000,"ExtraAggregators":[],"ExtraSampleRate":1,"MaxTPS":10,"ReceiverHost":"localhost","ReceiverPort":8126,"ConnectionLimit":2000,"ReceiverTimeout":0,"StatsdHost":"127.0.0.1","StatsdPort":8125,"LogLevel":"INFO","LogFilePath":"/var/log/datadog/trace-agent.log"}, + "trace_writer": {"Payloads":4,"Bytes":3245,"Traces":26,"Events":123,"Errors":0}, + "stats_writer": {"Payloads":6,"Bytes":8329,"StatsBuckets":12,"Errors":0}, + "service_writer": {"Payloads":1,"Bytes":1234,"Services":2,"Errors":0}, + "memstats": 
{"Alloc":773552,"TotalAlloc":773552,"Sys":3346432,"Lookups":6,"Mallocs":7231,"Frees":561,"HeapAlloc":773552,"HeapSys":1572864,"HeapIdle":49152,"HeapInuse":1523712,"HeapReleased":0,"HeapObjects":6670,"StackInuse":524288,"StackSys":524288,"MSpanInuse":24480,"MSpanSys":32768,"MCacheInuse":4800,"MCacheSys":16384,"BuckHashSys":2675,"GCSys":131072,"OtherSys":1066381,"NextGC":4194304,"LastGC":0,"PauseTotalNs":0,"PauseNs":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"PauseEnd":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"NumGC":0,"GCCPUFraction":0,"EnableGC":true,"DebugGC":false,"BySize":[{"Size":0,"Mallocs":0,"Frees":0},{"Size":8,"Mallocs":126,"Frees":0},{"Size":16,"Mallocs":825,"Frees":0},{"Size":32,"Mallocs":4208,"Frees":0},{"Size":48,"Mallocs":345,"Frees":0},{"Size":64,"Mallocs":262,"Frees":0},{"Size":80,"Mallocs":93,"Frees":0},{"Size":96,"Mallocs":70,"Frees":0},{"Size":112,"Mallocs":97,"Frees":0},{"Size":128,"Mallocs":24,"Frees":0},{"Size":144,"Mallocs":25,"Frees":0},{"Size":160,"Mallocs":57,"Frees":0},{"Size":176,"Mallocs":128,"Frees":0},{"Size":192,"Mallocs":13,"Frees":0},{"Size":208,"Mallocs":77,"Frees":0},{"Size":224,"Mallocs":3,"Frees":0},{"Size":240,"Mallocs":2,"Frees":0},{"Size":256,"Mallocs":17,"Frees":0},{"Size":288,"Mallocs":64,"Frees":0},{"Size":320,"Mallocs":12,"Frees":0},{"Size":352,"Mallocs":20,"Frees":0},{"Size":384,"Mallocs":1,"Frees":0},{"Size":416,"Mallocs":59,"Frees":0},{"Size":448,"Mallocs":0,"Frees":0},{"Size":480,"Mallocs":3,"Frees":0},{"Size":512,"Mallocs":2,"Frees":0},{"Size":576,"Mallocs":17,"Frees":0},{"Size":640,"Mallocs":6,"Frees":0},{"Size":704,"Mallocs":10,"Frees":0},{"Size":768,"Mallocs":0,"Frees":0},{"Size":896,"Mallocs":11,"Frees":0},{"Size":1024,"Mallocs":11,"Frees":0},{"Size":1152,"Mallocs":12,"Frees":0},{"Size":1280,"Mallocs":2,"Frees":0},{"Size":1408,"Mallocs":2,"Frees":0},{"Size":1536,"Mallocs":0,"Frees":0},{"Size":1664,"Mallocs":10,"Frees":0},{"Size":2048,"Mallocs":17,"Frees":0},{"Size":2304,"Mallocs":7,"Frees":0},{"Size":2560,"Mallocs":1,"Frees":0},{"Size":2816,"Mallocs":1,"Frees":0},{"Size":3072,"Mallocs":1,"Frees":0},{"Size":3328,"Mallocs":7,"Frees":0},{"Size":4096,"Mallocs":4,"Frees":0},{"Size":4608,"Mallocs":1,"Frees":0},{"Size":5376,"Mallocs":6,"Frees":0},{"Size":6144,"Mallocs":4,"Frees":0},{"Size":6400,"Mallocs":0,"Frees":0},{"Size":6656,"Mallocs":1,"Frees":0},{"Size":6912,"Mallocs":0,"Frees":0},{"Size":8192,"Mallocs":0,"Frees":0},{"Size":8448,"Mallocs":0,"Frees":0},{"Size":8704,"Mallocs":1,"Frees":0},{"Size":9472,"Mallocs":0,"Frees":0},{"Size":10496,"Mallocs":0,"Frees":0},{"Size":12288,"Mallocs":1,"Frees":0},{"Size":13568,"Mal
locs":0,"Frees":0},{"Size":14080,"Mallocs":0,"Frees":0},{"Size":16384,"Mallocs":0,"Frees":0},{"Size":16640,"Mallocs":0,"Frees":0},{"Size":17664,"Mallocs":1,"Frees":0}]}, + "pid": 38149, + "ratebyservice": {"service:,env:":1,"service:myapp,env:dev":0.123}, + "receiver": [{}], + "presampler": {"Rate":1.0}, + "uptime": 15, + "version": {"BuildDate": "2017-02-01T14:28:10+0100", "GitBranch": "ufoot/statusinfo", "GitCommit": "396a217", "GoVersion": "go version go1.7 darwin/amd64", "Version": "0.99.0"} +} diff --git a/pkg/trace/info/testdata/warning.info b/pkg/trace/info/testdata/warning.info new file mode 100644 index 0000000000000..4ee489a5dffea --- /dev/null +++ b/pkg/trace/info/testdata/warning.info @@ -0,0 +1,33 @@ +====================== +Trace Agent (v 0.99.0) +====================== + + Pid: 38149 + Uptime: 15 seconds + Mem alloc: 773552 bytes + + Hostname: localhost.localdomain + Receiver: localhost:8126 + Endpoints: + https://trace.agent.datadoghq.com + + --- Receiver stats (1 min) --- + + From python 2.7.6 (CPython), client 0.9.0 + Traces received: 70 (10679 bytes) + Spans received: 984 + Services received: 0 (0 bytes) + WARNING: Traces dropped: 23 + WARNING: Spans dropped: 184 + + WARNING: Pre-sampling traces: 42.1 % + WARNING: Pre-sampler: raising pre-sampling rate from 3.1 % to 5.0 % + + --- Writer stats (1 min) --- + + Traces: 4 payloads, 26 traces, 3245 bytes + WARNING: Traces API errors (1 min): 3 + Stats: 6 payloads, 12 stats buckets, 8329 bytes + WARNING: Stats API errors (1 min): 1 + Services: 1 payloads, 2 services, 1234 bytes + WARNING: Services API errors (1 min): 1 diff --git a/pkg/trace/info/testdata/warning.json b/pkg/trace/info/testdata/warning.json new file mode 100644 index 0000000000000..44f832c781430 --- /dev/null +++ b/pkg/trace/info/testdata/warning.json @@ -0,0 +1,13 @@ +{ + "cmdline": ["./trace-agent"], + "config": {"Enabled":true,"Hostname":"localhost.localdomain","DefaultEnv":"none","Endpoints":[{"Host": "https://trace.agent.datadoghq.com"}],"APIPayloadBufferMaxSize":16777216,"BucketInterval":10000000000,"ExtraAggregators":[],"ExtraSampleRate":1,"MaxTPS":10,"ReceiverHost":"localhost","ReceiverPort":8126,"ConnectionLimit":2000,"ReceiverTimeout":0,"StatsdHost":"127.0.0.1","StatsdPort":8125,"LogLevel":"INFO","LogFilePath":"/var/log/datadog/trace-agent.log"}, + "trace_writer": {"Payloads":4,"Bytes":3245,"Traces":26,"Errors":3}, + "stats_writer": {"Payloads":6,"Bytes":8329,"StatsBuckets":12,"Errors":1}, + "service_writer": {"Payloads":1,"Bytes":1234,"Services":2,"Errors":1}, + "memstats": 
{"Alloc":773552,"TotalAlloc":773552,"Sys":3346432,"Lookups":6,"Mallocs":7231,"Frees":561,"HeapAlloc":773552,"HeapSys":1572864,"HeapIdle":49152,"HeapInuse":1523712,"HeapReleased":0,"HeapObjects":6670,"StackInuse":524288,"StackSys":524288,"MSpanInuse":24480,"MSpanSys":32768,"MCacheInuse":4800,"MCacheSys":16384,"BuckHashSys":2675,"GCSys":131072,"OtherSys":1066381,"NextGC":4194304,"LastGC":0,"PauseTotalNs":0,"PauseNs":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"PauseEnd":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"NumGC":0,"GCCPUFraction":0,"EnableGC":true,"DebugGC":false,"BySize":[{"Size":0,"Mallocs":0,"Frees":0},{"Size":8,"Mallocs":126,"Frees":0},{"Size":16,"Mallocs":825,"Frees":0},{"Size":32,"Mallocs":4208,"Frees":0},{"Size":48,"Mallocs":345,"Frees":0},{"Size":64,"Mallocs":262,"Frees":0},{"Size":80,"Mallocs":93,"Frees":0},{"Size":96,"Mallocs":70,"Frees":0},{"Size":112,"Mallocs":97,"Frees":0},{"Size":128,"Mallocs":24,"Frees":0},{"Size":144,"Mallocs":25,"Frees":0},{"Size":160,"Mallocs":57,"Frees":0},{"Size":176,"Mallocs":128,"Frees":0},{"Size":192,"Mallocs":13,"Frees":0},{"Size":208,"Mallocs":77,"Frees":0},{"Size":224,"Mallocs":3,"Frees":0},{"Size":240,"Mallocs":2,"Frees":0},{"Size":256,"Mallocs":17,"Frees":0},{"Size":288,"Mallocs":64,"Frees":0},{"Size":320,"Mallocs":12,"Frees":0},{"Size":352,"Mallocs":20,"Frees":0},{"Size":384,"Mallocs":1,"Frees":0},{"Size":416,"Mallocs":59,"Frees":0},{"Size":448,"Mallocs":0,"Frees":0},{"Size":480,"Mallocs":3,"Frees":0},{"Size":512,"Mallocs":2,"Frees":0},{"Size":576,"Mallocs":17,"Frees":0},{"Size":640,"Mallocs":6,"Frees":0},{"Size":704,"Mallocs":10,"Frees":0},{"Size":768,"Mallocs":0,"Frees":0},{"Size":896,"Mallocs":11,"Frees":0},{"Size":1024,"Mallocs":11,"Frees":0},{"Size":1152,"Mallocs":12,"Frees":0},{"Size":1280,"Mallocs":2,"Frees":0},{"Size":1408,"Mallocs":2,"Frees":0},{"Size":1536,"Mallocs":0,"Frees":0},{"Size":1664,"Mallocs":10,"Frees":0},{"Size":2048,"Mallocs":17,"Frees":0},{"Size":2304,"Mallocs":7,"Frees":0},{"Size":2560,"Mallocs":1,"Frees":0},{"Size":2816,"Mallocs":1,"Frees":0},{"Size":3072,"Mallocs":1,"Frees":0},{"Size":3328,"Mallocs":7,"Frees":0},{"Size":4096,"Mallocs":4,"Frees":0},{"Size":4608,"Mallocs":1,"Frees":0},{"Size":5376,"Mallocs":6,"Frees":0},{"Size":6144,"Mallocs":4,"Frees":0},{"Size":6400,"Mallocs":0,"Frees":0},{"Size":6656,"Mallocs":1,"Frees":0},{"Size":6912,"Mallocs":0,"Frees":0},{"Size":8192,"Mallocs":0,"Frees":0},{"Size":8448,"Mallocs":0,"Frees":0},{"Size":8704,"Mallocs":1,"Frees":0},{"Size":9472,"Mallocs":0,"Frees":0},{"Size":10496,"Mallocs":0,"Frees":0},{"Size":12288,"Mallocs":1,"Frees":0},{"Size":13568,"Mal
locs":0,"Frees":0},{"Size":14080,"Mallocs":0,"Frees":0},{"Size":16384,"Mallocs":0,"Frees":0},{"Size":16640,"Mallocs":0,"Frees":0},{"Size":17664,"Mallocs":1,"Frees":0}]}, + "pid": 38149, + "receiver": [{"Lang":"python","LangVersion":"2.7.6","Interpreter":"CPython","TracerVersion":"0.9.0","TracesReceived":70,"TracesDropped":23,"TracesBytes":10679,"SpansReceived":984,"SpansDropped":184,"ServicesReceived":0,"ServicesBytes":0}], + "presampler": {"Rate":0.421,"Error":"raising pre-sampling rate from 3.1 % to 5.0 %"}, + "uptime": 15, + "version": {"BuildDate": "2017-02-01T14:28:10+0100", "GitBranch": "ufoot/statusinfo", "GitCommit": "396a217", "GoVersion": "go version go1.7 darwin/amd64", "Version": "0.99.0"} +} diff --git a/pkg/trace/info/version.go b/pkg/trace/info/version.go new file mode 100644 index 0000000000000..2a140f4f16481 --- /dev/null +++ b/pkg/trace/info/version.go @@ -0,0 +1,58 @@ +//go:generate go run make.go + +package info + +import ( + "bytes" + "fmt" +) + +// version info sourced from build flags +var ( + Version string + GitCommit string + GitBranch string + BuildDate string + GoVersion string +) + +// VersionString returns the version information filled in at build time +func VersionString() string { + var buf bytes.Buffer + + if Version != "" { + fmt.Fprintf(&buf, "Version: %s\n", Version) + } + if GitCommit != "" { + fmt.Fprintf(&buf, "Git hash: %s\n", GitCommit) + } + if GitBranch != "" { + fmt.Fprintf(&buf, "Git branch: %s\n", GitBranch) + } + if BuildDate != "" { + fmt.Fprintf(&buf, "Build date: %s\n", BuildDate) + } + if GoVersion != "" { + fmt.Fprintf(&buf, "Go Version: %s\n", GoVersion) + } + + return buf.String() +} + +type infoVersion struct { + Version string + GitCommit string + GitBranch string + BuildDate string + GoVersion string +} + +func publishVersion() interface{} { + return infoVersion{ + Version: Version, + GitCommit: GitCommit, + GitBranch: GitBranch, + BuildDate: BuildDate, + GoVersion: GoVersion, + } +} diff --git a/pkg/trace/info/writer.go b/pkg/trace/info/writer.go new file mode 100644 index 0000000000000..636d21c377085 --- /dev/null +++ b/pkg/trace/info/writer.go @@ -0,0 +1,71 @@ +package info + +// TraceWriterInfo represents statistics from the trace writer. +type TraceWriterInfo struct { + Payloads int64 + Traces int64 + Events int64 + Spans int64 + Errors int64 + Retries int64 + Bytes int64 + SingleMaxSpans int64 +} + +// ServiceWriterInfo represents statistics from the service writer. +type ServiceWriterInfo struct { + Payloads int64 + Services int64 + Errors int64 + Retries int64 + Bytes int64 +} + +// StatsWriterInfo represents statistics from the stats writer. 
+type StatsWriterInfo struct { + Payloads int64 + StatsBuckets int64 + Errors int64 + Retries int64 + Splits int64 + Bytes int64 +} + +// UpdateTraceWriterInfo updates internal trace writer stats +func UpdateTraceWriterInfo(tws TraceWriterInfo) { + infoMu.Lock() + defer infoMu.Unlock() + traceWriterInfo = tws +} + +func publishTraceWriterInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return traceWriterInfo +} + +// UpdateStatsWriterInfo updates internal stats writer stats +func UpdateStatsWriterInfo(sws StatsWriterInfo) { + infoMu.Lock() + defer infoMu.Unlock() + statsWriterInfo = sws +} + +func publishStatsWriterInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return statsWriterInfo +} + +// UpdateServiceWriterInfo updates internal service writer stats +func UpdateServiceWriterInfo(sws ServiceWriterInfo) { + infoMu.Lock() + defer infoMu.Unlock() + serviceWriterInfo = sws +} + +func publishServiceWriterInfo() interface{} { + infoMu.RLock() + defer infoMu.RUnlock() + return serviceWriterInfo +} diff --git a/pkg/trace/metrics/metrics.go b/pkg/trace/metrics/metrics.go new file mode 100644 index 0000000000000..74e725b194fb7 --- /dev/null +++ b/pkg/trace/metrics/metrics.go @@ -0,0 +1,56 @@ +// Package metrics exposes utilities for setting up and using a sub-set of Datadog's dogstatsd +// client. +package metrics + +import ( + "fmt" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-go/statsd" +) + +// StatsClient represents a client capable of sending stats to some stat endpoint. +type StatsClient interface { + Gauge(name string, value float64, tags []string, rate float64) error + Count(name string, value int64, tags []string, rate float64) error + Histogram(name string, value float64, tags []string, rate float64) error +} + +// Client is a global Statsd client. When a client is configured via Configure, +// that becomes the new global Statsd client in the package. +var Client StatsClient = (*statsd.Client)(nil) + +// Gauge calls Gauge on the global Client, if set. +func Gauge(name string, value float64, tags []string, rate float64) error { + if Client == nil { + return nil // no-op + } + return Client.Gauge(name, value, tags, rate) +} + +// Count calls Count on the global Client, if set. +func Count(name string, value int64, tags []string, rate float64) error { + if Client == nil { + return nil // no-op + } + return Client.Count(name, value, tags, rate) +} + +// Histogram calls Histogram on the global Client, if set. +func Histogram(name string, value float64, tags []string, rate float64) error { + if Client == nil { + return nil // no-op + } + return Client.Histogram(name, value, tags, rate) +} + +// Configure creates a statsd client for the given agent's configuration, using the specified global tags. +func Configure(conf *config.AgentConfig, tags []string) error { + client, err := statsd.New(fmt.Sprintf("%s:%d", conf.StatsdHost, conf.StatsdPort)) + if err != nil { + return err + } + client.Tags = tags + Client = client + return nil +} diff --git a/pkg/trace/obfuscate/http.go b/pkg/trace/obfuscate/http.go new file mode 100644 index 0000000000000..4e17529aca4cd --- /dev/null +++ b/pkg/trace/obfuscate/http.go @@ -0,0 +1,56 @@ +package obfuscate + +import ( + "net/url" + "strings" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// obfuscateHTTP obfuscates query strings and path segments containing digits in the span's +// "http.url" tag, when one or both of these options are enabled. 
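+// For example (illustrative), with both options enabled a tag value such as
+// "http://foo.com/users/123/page/1?q=secret" becomes
+// "http://foo.com/users/?/page/??".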
+func (o *Obfuscator) obfuscateHTTP(span *pb.Span) { + if span.Meta == nil { + return + } + if !o.opts.HTTP.RemoveQueryString && !o.opts.HTTP.RemovePathDigits { + // nothing to do + return + } + const k = "http.url" + val, ok := span.Meta[k] + if !ok { + return + } + u, err := url.Parse(val) + if err != nil { + // should not happen for valid URLs, but better obfuscate everything + // rather than expose sensitive information when this option is on. + span.Meta[k] = "?" + return + } + if o.opts.HTTP.RemoveQueryString && u.RawQuery != "" { + u.ForceQuery = true // add the '?' + u.RawQuery = "" + } + if o.opts.HTTP.RemovePathDigits { + segs := strings.Split(u.Path, "/") + var changed bool + for i, seg := range segs { + for _, ch := range []byte(seg) { + if ch >= '0' && ch <= '9' { + // we can not set the question mark directly here because the url + // package will escape it into %3F, so we use this placeholder and + // replace it further down. + segs[i] = "/REDACTED/" + changed = true + break + } + } + } + if changed { + u.Path = strings.Join(segs, "/") + } + } + span.Meta[k] = strings.Replace(u.String(), "/REDACTED/", "?", -1) +} diff --git a/pkg/trace/obfuscate/http_test.go b/pkg/trace/obfuscate/http_test.go new file mode 100644 index 0000000000000..016e4bfedc55b --- /dev/null +++ b/pkg/trace/obfuscate/http_test.go @@ -0,0 +1,159 @@ +package obfuscate + +import ( + "strconv" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +// inOutTest is holds a test input and an expected output. +type inOutTest struct{ in, out string } + +func TestObfuscateHTTP(t *testing.T) { + const testURL = "http://foo.com/1/2/3?q=james" + + t.Run("disabled", testHTTPObfuscation(&inOutTest{ + in: testURL, + out: testURL, + }, nil)) + + t.Run("query", func(t *testing.T) { + conf := &config.HTTPObfuscationConfig{RemoveQueryString: true} + for ti, tt := range []inOutTest{ + { + in: "http://foo.com/", + out: "http://foo.com/", + }, + { + in: "http://foo.com/123", + out: "http://foo.com/123", + }, + { + in: "http://foo.com/id/123/page/1?search=bar&page=2", + out: "http://foo.com/id/123/page/1?", + }, + { + in: "http://foo.com/id/123/page/1?search=bar&page=2#fragment", + out: "http://foo.com/id/123/page/1?#fragment", + }, + { + in: "http://foo.com/id/123/page/1?blabla", + out: "http://foo.com/id/123/page/1?", + }, + { + in: "http://foo.com/id/123/pa%3Fge/1?blabla", + out: "http://foo.com/id/123/pa%3Fge/1?", + }, + } { + t.Run(strconv.Itoa(ti), testHTTPObfuscation(&tt, conf)) + } + }) + + t.Run("digits", func(t *testing.T) { + conf := &config.HTTPObfuscationConfig{RemovePathDigits: true} + for ti, tt := range []inOutTest{ + { + in: "http://foo.com/", + out: "http://foo.com/", + }, + { + in: "http://foo.com/name?query=search", + out: "http://foo.com/name?query=search", + }, + { + in: "http://foo.com/id/123/page/1?search=bar&page=2", + out: "http://foo.com/id/?/page/??search=bar&page=2", + }, + { + in: "http://foo.com/id/a1/page/1qwe233?search=bar&page=2#fragment-123", + out: "http://foo.com/id/?/page/??search=bar&page=2#fragment-123", + }, + { + in: "http://foo.com/123", + out: "http://foo.com/?", + }, + { + in: "http://foo.com/123/abcd9", + out: "http://foo.com/?/?", + }, + { + in: "http://foo.com/123/name/abcd9", + out: "http://foo.com/?/name/?", + }, + { + in: "http://foo.com/123/name/abcd9", + out: "http://foo.com/?/name/?", + }, + { + in: "http://foo.com/1%3F3/nam%3Fe/abcd9", + out: 
"http://foo.com/?/nam%3Fe/?", + }, + } { + t.Run(strconv.Itoa(ti), testHTTPObfuscation(&tt, conf)) + } + }) + + t.Run("both", func(t *testing.T) { + conf := &config.HTTPObfuscationConfig{RemoveQueryString: true, RemovePathDigits: true} + for ti, tt := range []inOutTest{ + { + in: "http://foo.com/", + out: "http://foo.com/", + }, + { + in: "http://foo.com/name/id", + out: "http://foo.com/name/id", + }, + { + in: "http://foo.com/name/id?query=search", + out: "http://foo.com/name/id?", + }, + { + in: "http://foo.com/id/123/page/1?search=bar&page=2", + out: "http://foo.com/id/?/page/??", + }, + { + in: "http://foo.com/id/123/page/1?search=bar&page=2#fragment", + out: "http://foo.com/id/?/page/??#fragment", + }, + { + in: "http://foo.com/1%3F3/nam%3Fe/abcd9", + out: "http://foo.com/?/nam%3Fe/?", + }, + { + in: "http://foo.com/id/123/pa%3Fge/1?blabla", + out: "http://foo.com/id/?/pa%3Fge/??", + }, + } { + t.Run(strconv.Itoa(ti), testHTTPObfuscation(&tt, conf)) + } + }) + + t.Run("wrong-type", func(t *testing.T) { + assert := assert.New(t) + span := pb.Span{Type: "web_server", Meta: map[string]string{"http.url": testURL}} + cfg := config.HTTPObfuscationConfig{RemoveQueryString: true, RemovePathDigits: true} + NewObfuscator(&config.ObfuscationConfig{HTTP: cfg}).Obfuscate(&span) + assert.Equal(testURL, span.Meta["http.url"]) + }) +} + +// testHTTPObfuscation tests that the given input results in the given output using the passed configuration. +func testHTTPObfuscation(tt *inOutTest, conf *config.HTTPObfuscationConfig) func(t *testing.T) { + return func(t *testing.T) { + var cfg config.HTTPObfuscationConfig + if conf != nil { + cfg = *conf + } + assert := assert.New(t) + span := pb.Span{ + Type: "http", + Meta: map[string]string{"http.url": tt.in}, + } + NewObfuscator(&config.ObfuscationConfig{HTTP: cfg}).Obfuscate(&span) + assert.Equal(tt.out, span.Meta["http.url"]) + } +} diff --git a/pkg/trace/obfuscate/json.go b/pkg/trace/obfuscate/json.go new file mode 100644 index 0000000000000..c2376b77cbdd3 --- /dev/null +++ b/pkg/trace/obfuscate/json.go @@ -0,0 +1,150 @@ +package obfuscate + +import ( + "strings" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// obfuscateJSON obfuscates the given span's tag using the given obfuscator. If the obfuscator is +// nil it is considered disabled. +func (o *Obfuscator) obfuscateJSON(span *pb.Span, tag string, obfuscator *jsonObfuscator) { + if obfuscator == nil || span.Meta == nil || span.Meta[tag] == "" { + // obfuscator is disabled or tag is not present + return + } + span.Meta[tag], _ = obfuscator.obfuscate([]byte(span.Meta[tag])) + // we should accept whatever the obfuscator returns, even if it's an error: a parsing + // error simply means that the JSON was invalid, meaning that we've only obfuscated + // as much of it as we could. It is safe to accept the output, even if partial. +} + +type jsonObfuscator struct { + keepers map[string]bool // these keys will not be obfuscated + + scan *scanner // scanner + closures []bool // closure stack, true if object (e.g. 
{[{ => []bool{true, false, true}) + key bool // true if scanning a key + + wiped bool // true if obfuscation string (`"?"`) was already written for current value + keeping bool // true if not obfuscating + keepDepth int // the depth at which we've stopped obfuscating +} + +func newJSONObfuscator(cfg *config.JSONObfuscationConfig) *jsonObfuscator { + keepValue := make(map[string]bool, len(cfg.KeepValues)) + for _, v := range cfg.KeepValues { + keepValue[v] = true + } + return &jsonObfuscator{ + closures: []bool{}, + keepers: keepValue, + scan: &scanner{}, + } +} + +// setKey verifies if we are currently scanning a key based on the current state +// and updates the state accordingly. It must be called only after a closure or a +// value scan has ended. +func (p *jsonObfuscator) setKey() { + n := len(p.closures) + p.key = n == 0 || p.closures[n-1] // true if we are at top level or in an object + p.wiped = false +} + +func (p *jsonObfuscator) obfuscate(data []byte) (string, error) { + var out strings.Builder + buf := make([]byte, 0, 10) // recording key token + p.scan.reset() + for _, c := range data { + p.scan.bytes++ + op := p.scan.step(p.scan, c) + depth := len(p.closures) + switch op { + case scanBeginObject: + // object begins: { + p.closures = append(p.closures, true) + p.setKey() + + case scanBeginArray: + // array begins: [ + p.closures = append(p.closures, false) + p.setKey() + + case scanEndArray, scanEndObject: + // array or object closing + if n := len(p.closures) - 1; n > 0 { + p.closures = p.closures[:n] + } + fallthrough + + case scanObjectValue, scanArrayValue: + // done scanning value + p.setKey() + if p.keeping && depth < p.keepDepth { + p.keeping = false + } + + case scanBeginLiteral, scanContinue: + // starting or continuing a literal + if p.key { + // it's a key + buf = append(buf, c) + } else if !p.keeping { + // it's a value we're not keeping + if !p.wiped { + out.Write([]byte(`"?"`)) + p.wiped = true + } + continue + } + + case scanObjectKey: + // done scanning key + k := strings.Trim(string(buf), `"`) + if !p.keeping && p.keepers[k] { + // we should not obfuscate values of this key + p.keeping = true + p.keepDepth = depth + 1 + } + buf = buf[:0] + p.key = false + + case scanSkipSpace: + continue + + case scanError: + // we've encountered an error, mark that there might be more JSON + // using the ellipsis and return whatever we've managed to obfuscate + // thus far. + out.Write([]byte("...")) + return out.String(), p.scan.err + } + out.WriteByte(c) + } + if p.scan.eof() == scanError { + // if an error occurred it's fine, simply add the ellipsis to indicate + // that the input has been truncated. + out.Write([]byte("...")) + return out.String(), p.scan.err + } + return out.String(), nil +} + +func stringOp(op int) string { + return [...]string{ + "Continue", + "BeginLiteral", + "BeginObject", + "ObjectKey", + "ObjectValue", + "EndObject", + "BeginArray", + "ArrayValue", + "EndArray", + "SkipSpace", + "End", + "Error", + }[op] +} diff --git a/pkg/trace/obfuscate/json_scanner.go b/pkg/trace/obfuscate/json_scanner.go new file mode 100644 index 0000000000000..57293fce987fb --- /dev/null +++ b/pkg/trace/obfuscate/json_scanner.go @@ -0,0 +1,581 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +// The code that follows is copied from go/src/encoding/json/scanner.go +// It may contain minor edits, such as allowing multiple JSON objects within +// the same input string (see stateEndTop) +// + +package obfuscate + +import "strconv" + +// A SyntaxError is a description of a JSON syntax error. +type SyntaxError struct { + msg string // description of error + Offset int64 // error occurred after reading Offset bytes +} + +func (e *SyntaxError) Error() string { return e.msg } + +// A scanner is a JSON scanning state machine. +// Callers call scan.reset() and then pass bytes in one at a time +// by calling scan.step(&scan, c) for each byte. +// The return value, referred to as an opcode, tells the +// caller about significant parsing events like beginning +// and ending literals, objects, and arrays, so that the +// caller can follow along if it wishes. +// The return value scanEnd indicates that a single top-level +// JSON value has been completed, *before* the byte that +// just got passed in. (The indication must be delayed in order +// to recognize the end of numbers: is 123 a whole value or +// the beginning of 12345e+6?). +type scanner struct { + // The step is a func to be called to execute the next transition. + // Also tried using an integer constant and a single func + // with a switch, but using the func directly was 10% faster + // on a 64-bit Mac Mini, and it's nicer to read. + step func(*scanner, byte) int + + // Reached end of top-level value. + endTop bool + + // Stack of what we're in the middle of - array values, object keys, object values. + parseState []int + + // Error that happened, if any. + err error + + // 1-byte redo (see undo method) + redo bool + redoCode int + redoState func(*scanner, byte) int + + // total bytes consumed, updated by decoder.Decode + bytes int64 +} + +// These values are returned by the state transition functions +// assigned to scanner.state and the method scanner.eof. +// They give details about the current state of the scan that +// callers might be interested to know about. +// It is okay to ignore the return value of any particular +// call to scanner.state: if one call returns scanError, +// every subsequent call will return scanError too. +const ( + // Continue. + scanContinue = iota // uninteresting byte + scanBeginLiteral // end implied by next result != scanContinue + scanBeginObject // begin object + scanObjectKey // just finished object key (string) + scanObjectValue // just finished non-last object value + scanEndObject // end object (implies scanObjectValue if possible) + scanBeginArray // begin array + scanArrayValue // just finished array value + scanEndArray // end array (implies scanArrayValue if possible) + scanSkipSpace // space byte; can skip; known to be last "continue" result + + // Stop. + scanEnd // top-level value ended *before* this byte; known to be first "stop" result + scanError // hit an error, scanner.err. +) + +// These values are stored in the parseState stack. +// They give the current state of a composite value +// being scanned. If the parser is inside a nested value +// the parseState describes the nested state, outermost at entry 0. +const ( + parseObjectKey = iota // parsing object key (before colon) + parseObjectValue // parsing object value (after colon) + parseArrayValue // parsing array value +) + +// reset prepares the scanner for use. +// It must be called before calling s.step. 
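+// reset may be called again at any time to discard the current state and
+// start scanning a new input.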
+func (s *scanner) reset() { + s.step = stateBeginValue + s.parseState = s.parseState[0:0] + s.err = nil + s.redo = false + s.endTop = false +} + +// eof tells the scanner that the end of input has been reached. +// It returns a scan status just as s.step does. +func (s *scanner) eof() int { + if s.err != nil { + return scanError + } + if s.endTop { + return scanEnd + } + s.step(s, ' ') + if s.endTop { + return scanEnd + } + if s.err == nil { + s.err = &SyntaxError{"unexpected end of JSON input", s.bytes} + } + return scanError +} + +// pushParseState pushes a new parse state p onto the parse stack. +func (s *scanner) pushParseState(p int) { + s.parseState = append(s.parseState, p) +} + +// popParseState pops a parse state (already obtained) off the stack +// and updates s.step accordingly. +func (s *scanner) popParseState() { + n := len(s.parseState) - 1 + s.parseState = s.parseState[0:n] + s.redo = false + if n == 0 { + s.step = stateEndTop + s.endTop = true + } else { + s.step = stateEndValue + } +} + +func isSpace(c byte) bool { + return c == ' ' || c == '\t' || c == '\r' || c == '\n' +} + +// stateBeginValueOrEmpty is the state after reading `[`. +func stateBeginValueOrEmpty(s *scanner, c byte) int { + if c <= ' ' && isSpace(c) { + return scanSkipSpace + } + if c == ']' { + return stateEndValue(s, c) + } + return stateBeginValue(s, c) +} + +// stateBeginValue is the state at the beginning of the input. +func stateBeginValue(s *scanner, c byte) int { + if c <= ' ' && isSpace(c) { + return scanSkipSpace + } + switch c { + case '{': + s.step = stateBeginStringOrEmpty + s.pushParseState(parseObjectKey) + return scanBeginObject + case '[': + s.step = stateBeginValueOrEmpty + s.pushParseState(parseArrayValue) + return scanBeginArray + case '"': + s.step = stateInString + return scanBeginLiteral + case '-': + s.step = stateNeg + return scanBeginLiteral + case '0': // beginning of 0.123 + s.step = state0 + return scanBeginLiteral + case 't': // beginning of true + s.step = stateT + return scanBeginLiteral + case 'f': // beginning of false + s.step = stateF + return scanBeginLiteral + case 'n': // beginning of null + s.step = stateN + return scanBeginLiteral + } + if '1' <= c && c <= '9' { // beginning of 1234.5 + s.step = state1 + return scanBeginLiteral + } + return s.error(c, "looking for beginning of value") +} + +// stateBeginStringOrEmpty is the state after reading `{`. +func stateBeginStringOrEmpty(s *scanner, c byte) int { + if c <= ' ' && isSpace(c) { + return scanSkipSpace + } + if c == '}' { + n := len(s.parseState) + s.parseState[n-1] = parseObjectValue + return stateEndValue(s, c) + } + return stateBeginString(s, c) +} + +// stateBeginString is the state after reading `{"key": value,`. +func stateBeginString(s *scanner, c byte) int { + if c <= ' ' && isSpace(c) { + return scanSkipSpace + } + if c == '"' { + s.step = stateInString + return scanBeginLiteral + } + return s.error(c, "looking for beginning of object key string") +} + +// stateEndValue is the state after completing a value, +// such as after reading `{}` or `true` or `["x"`. +func stateEndValue(s *scanner, c byte) int { + n := len(s.parseState) + if n == 0 { + // Completed top-level before the current byte. 
+ s.step = stateEndTop + s.endTop = true + return stateEndTop(s, c) + } + if c <= ' ' && isSpace(c) { + s.step = stateEndValue + return scanSkipSpace + } + ps := s.parseState[n-1] + switch ps { + case parseObjectKey: + if c == ':' { + s.parseState[n-1] = parseObjectValue + s.step = stateBeginValue + return scanObjectKey + } + return s.error(c, "after object key") + case parseObjectValue: + if c == ',' { + s.parseState[n-1] = parseObjectKey + s.step = stateBeginString + return scanObjectValue + } + if c == '}' { + s.popParseState() + return scanEndObject + } + return s.error(c, "after object key:value pair") + case parseArrayValue: + if c == ',' { + s.step = stateBeginValue + return scanArrayValue + } + if c == ']' { + s.popParseState() + return scanEndArray + } + return s.error(c, "after array element") + } + return s.error(c, "") +} + +// stateEndTop is the state after finishing the top-level value, +// such as after reading `{}` or `[1,2,3]`. +// Only space characters should be seen now. +func stateEndTop(s *scanner, c byte) int { + if c != ' ' && c != '\t' && c != '\r' && c != '\n' { + // The former behaviour has been removed. Now, if anything + // other than whitespace follows, we assume a new JSON string + // might be starting. This allows us to continue obfuscating + // further strings in cases where there are multiple JSON + // objects enumerated sequentially within the same input. + // This is a common case for ElasticSearch response bodies. + s.reset() + return s.step(s, c) + } + return scanEnd +} + +// stateInString is the state after reading `"`. +func stateInString(s *scanner, c byte) int { + if c == '"' { + s.step = stateEndValue + return scanContinue + } + if c == '\\' { + s.step = stateInStringEsc + return scanContinue + } + if c < 0x20 { + return s.error(c, "in string literal") + } + return scanContinue +} + +// stateInStringEsc is the state after reading `"\` during a quoted string. +func stateInStringEsc(s *scanner, c byte) int { + switch c { + case 'b', 'f', 'n', 'r', 't', '\\', '/', '"': + s.step = stateInString + return scanContinue + case 'u': + s.step = stateInStringEscU + return scanContinue + } + return s.error(c, "in string escape code") +} + +// stateInStringEscU is the state after reading `"\u` during a quoted string. +func stateInStringEscU(s *scanner, c byte) int { + if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { + s.step = stateInStringEscU1 + return scanContinue + } + // numbers + return s.error(c, "in \\u hexadecimal character escape") +} + +// stateInStringEscU1 is the state after reading `"\u1` during a quoted string. +func stateInStringEscU1(s *scanner, c byte) int { + if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { + s.step = stateInStringEscU12 + return scanContinue + } + // numbers + return s.error(c, "in \\u hexadecimal character escape") +} + +// stateInStringEscU12 is the state after reading `"\u12` during a quoted string. +func stateInStringEscU12(s *scanner, c byte) int { + if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { + s.step = stateInStringEscU123 + return scanContinue + } + // numbers + return s.error(c, "in \\u hexadecimal character escape") +} + +// stateInStringEscU123 is the state after reading `"\u123` during a quoted string. 
+func stateInStringEscU123(s *scanner, c byte) int { + if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { + s.step = stateInString + return scanContinue + } + // numbers + return s.error(c, "in \\u hexadecimal character escape") +} + +// stateNeg is the state after reading `-` during a number. +func stateNeg(s *scanner, c byte) int { + if c == '0' { + s.step = state0 + return scanContinue + } + if '1' <= c && c <= '9' { + s.step = state1 + return scanContinue + } + return s.error(c, "in numeric literal") +} + +// state1 is the state after reading a non-zero integer during a number, +// such as after reading `1` or `100` but not `0`. +func state1(s *scanner, c byte) int { + if '0' <= c && c <= '9' { + s.step = state1 + return scanContinue + } + return state0(s, c) +} + +// state0 is the state after reading `0` during a number. +func state0(s *scanner, c byte) int { + if c == '.' { + s.step = stateDot + return scanContinue + } + if c == 'e' || c == 'E' { + s.step = stateE + return scanContinue + } + return stateEndValue(s, c) +} + +// stateDot is the state after reading the integer and decimal point in a number, +// such as after reading `1.`. +func stateDot(s *scanner, c byte) int { + if '0' <= c && c <= '9' { + s.step = stateDot0 + return scanContinue + } + return s.error(c, "after decimal point in numeric literal") +} + +// stateDot0 is the state after reading the integer, decimal point, and subsequent +// digits of a number, such as after reading `3.14`. +func stateDot0(s *scanner, c byte) int { + if '0' <= c && c <= '9' { + return scanContinue + } + if c == 'e' || c == 'E' { + s.step = stateE + return scanContinue + } + return stateEndValue(s, c) +} + +// stateE is the state after reading the mantissa and e in a number, +// such as after reading `314e` or `0.314e`. +func stateE(s *scanner, c byte) int { + if c == '+' || c == '-' { + s.step = stateESign + return scanContinue + } + return stateESign(s, c) +} + +// stateESign is the state after reading the mantissa, e, and sign in a number, +// such as after reading `314e-` or `0.314e+`. +func stateESign(s *scanner, c byte) int { + if '0' <= c && c <= '9' { + s.step = stateE0 + return scanContinue + } + return s.error(c, "in exponent of numeric literal") +} + +// stateE0 is the state after reading the mantissa, e, optional sign, +// and at least one digit of the exponent in a number, +// such as after reading `314e-2` or `0.314e+1` or `3.14e0`. +func stateE0(s *scanner, c byte) int { + if '0' <= c && c <= '9' { + return scanContinue + } + return stateEndValue(s, c) +} + +// stateT is the state after reading `t`. +func stateT(s *scanner, c byte) int { + if c == 'r' { + s.step = stateTr + return scanContinue + } + return s.error(c, "in literal true (expecting 'r')") +} + +// stateTr is the state after reading `tr`. +func stateTr(s *scanner, c byte) int { + if c == 'u' { + s.step = stateTru + return scanContinue + } + return s.error(c, "in literal true (expecting 'u')") +} + +// stateTru is the state after reading `tru`. +func stateTru(s *scanner, c byte) int { + if c == 'e' { + s.step = stateEndValue + return scanContinue + } + return s.error(c, "in literal true (expecting 'e')") +} + +// stateF is the state after reading `f`. +func stateF(s *scanner, c byte) int { + if c == 'a' { + s.step = stateFa + return scanContinue + } + return s.error(c, "in literal false (expecting 'a')") +} + +// stateFa is the state after reading `fa`. 
+func stateFa(s *scanner, c byte) int { + if c == 'l' { + s.step = stateFal + return scanContinue + } + return s.error(c, "in literal false (expecting 'l')") +} + +// stateFal is the state after reading `fal`. +func stateFal(s *scanner, c byte) int { + if c == 's' { + s.step = stateFals + return scanContinue + } + return s.error(c, "in literal false (expecting 's')") +} + +// stateFals is the state after reading `fals`. +func stateFals(s *scanner, c byte) int { + if c == 'e' { + s.step = stateEndValue + return scanContinue + } + return s.error(c, "in literal false (expecting 'e')") +} + +// stateN is the state after reading `n`. +func stateN(s *scanner, c byte) int { + if c == 'u' { + s.step = stateNu + return scanContinue + } + return s.error(c, "in literal null (expecting 'u')") +} + +// stateNu is the state after reading `nu`. +func stateNu(s *scanner, c byte) int { + if c == 'l' { + s.step = stateNul + return scanContinue + } + return s.error(c, "in literal null (expecting 'l')") +} + +// stateNul is the state after reading `nul`. +func stateNul(s *scanner, c byte) int { + if c == 'l' { + s.step = stateEndValue + return scanContinue + } + return s.error(c, "in literal null (expecting 'l')") +} + +// stateError is the state after reaching a syntax error, +// such as after reading `[1}` or `5.1.2`. +func stateError(s *scanner, c byte) int { + return scanError +} + +// error records an error and switches to the error state. +func (s *scanner) error(c byte, context string) int { + s.step = stateError + s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes} + return scanError +} + +// quoteChar formats c as a quoted character literal +func quoteChar(c byte) string { + // special cases - different from quoted strings + if c == '\'' { + return `'\''` + } + if c == '"' { + return `'"'` + } + + // use quoted string with different quotation marks + s := strconv.Quote(string(c)) + return "'" + s[1:len(s)-1] + "'" +} + +// undo causes the scanner to return scanCode from the next state transition. +// This gives callers a simple 1-byte undo mechanism. +func (s *scanner) undo(scanCode int) { + if s.redo { + panic("json: invalid use of scanner") + } + s.redoCode = scanCode + s.redoState = s.step + s.step = stateRedo + s.redo = true +} + +// stateRedo helps implement the scanner's 1-byte undo. 
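+// It restores the step function saved by undo and returns the recorded opcode.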
+func stateRedo(s *scanner, c byte) int { + s.redo = false + s.step = s.redoState + return s.redoCode +} diff --git a/pkg/trace/obfuscate/json_test.go b/pkg/trace/obfuscate/json_test.go new file mode 100644 index 0000000000000..4ccf3c4b13dc9 --- /dev/null +++ b/pkg/trace/obfuscate/json_test.go @@ -0,0 +1,119 @@ +package obfuscate + +import ( + "encoding/json" + "encoding/xml" + "log" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/stretchr/testify/assert" +) + +// obfuscateTestFile contains all the tests for JSON obfuscation +const obfuscateTestFile = "./testdata/json_tests.xml" + +type xmlObfuscateTests struct { + XMLName xml.Name `xml:"ObfuscateTests,-"` + Tests []*xmlObfuscateTest `xml:"TestSuite>Test"` +} + +type xmlObfuscateTest struct { + Tag string + DontNormalize bool // this test contains invalid JSON + In string + Out string + KeepValues []string `xml:"KeepValues>key"` +} + +// loadTests loads all XML tests from ./testdata/obfuscate.xml +func loadTests() ([]*xmlObfuscateTest, error) { + path, err := filepath.Abs(obfuscateTestFile) + if err != nil { + return nil, err + } + f, err := os.Open(path) + defer f.Close() + if err != nil { + return nil, err + } + var suite xmlObfuscateTests + if err := xml.NewDecoder(f).Decode(&suite); err != nil { + return nil, err + } + for _, test := range suite.Tests { + // normalize JSON output + if !test.DontNormalize { + test.Out = normalize(test.Out) + test.In = normalize(test.In) + } + } + return suite.Tests, err +} + +// normalize normalizes JSON input. This allows us to write "pretty" JSON +// inside the test file using \t, \r, \n, etc. +func normalize(in string) string { + var tmp map[string]interface{} + if err := json.Unmarshal([]byte(in), &tmp); err != nil { + log.Fatal(err) + } + out, err := json.Marshal(tmp) + if err != nil { + log.Fatal(err) + } + return string(out) +} + +// jsonSuite holds the JSON test suite. It is loaded in TestMain. 
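+// It is shared by TestObfuscateJSON and BenchmarkObfuscateJSON.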
+var jsonSuite []*xmlObfuscateTest + +func TestObfuscateJSON(t *testing.T) { + runTest := func(s *xmlObfuscateTest) func(*testing.T) { + return func(t *testing.T) { + assert := assert.New(t) + cfg := &config.JSONObfuscationConfig{KeepValues: s.KeepValues} + out, err := newJSONObfuscator(cfg).obfuscate([]byte(s.In)) + if !s.DontNormalize { + assert.NoError(err) + } + assert.Equal(s.Out, out) + } + } + for i, s := range jsonSuite { + var name string + if s.DontNormalize { + name += "invalid/" + } + name += strconv.Itoa(i + 1) + t.Run(name, runTest(s)) + } +} + +func BenchmarkObfuscateJSON(b *testing.B) { + cfg := &config.JSONObfuscationConfig{KeepValues: []string{"highlight"}} + if len(jsonSuite) == 0 { + b.Fatal("no test suite loaded") + } + var ran int + for i := len(jsonSuite) - 1; i >= 0; i-- { + ran++ + if ran > 3 { + // run max 3 benchmarks + break + } + test := jsonSuite[i] + b.Run(strconv.Itoa(len(test.In)), func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _, err := newJSONObfuscator(cfg).obfuscate([]byte(test.In)) + if !test.DontNormalize && err != nil { + b.Fatal(err) + } + } + }) + } +} diff --git a/pkg/trace/obfuscate/memcached.go b/pkg/trace/obfuscate/memcached.go new file mode 100644 index 0000000000000..707c34c081762 --- /dev/null +++ b/pkg/trace/obfuscate/memcached.go @@ -0,0 +1,21 @@ +package obfuscate + +import ( + "strings" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +func (*Obfuscator) obfuscateMemcached(span *pb.Span) { + const k = "memcached.command" + if span.Meta == nil || span.Meta[k] == "" { + return + } + // All memcached commands end with new lines [1]. In the case of storage + // commands, key values follow after. Knowing this, all we have to do + // to obfuscate sensitive information is to remove everything that follows + // a new line. For non-storage commands, this will have no effect. + // [1]: https://github.com/memcached/memcached/blob/master/doc/protocol.txt + cmd := strings.SplitN(span.Meta[k], "\r\n", 2)[0] + span.Meta[k] = strings.TrimSpace(cmd) +} diff --git a/pkg/trace/obfuscate/memcached_test.go b/pkg/trace/obfuscate/memcached_test.go new file mode 100644 index 0000000000000..457a81a3c85e5 --- /dev/null +++ b/pkg/trace/obfuscate/memcached_test.go @@ -0,0 +1,43 @@ +package obfuscate + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func TestObfuscateMemcached(t *testing.T) { + const k = "memcached.command" + for _, tt := range []struct { + in, out string + }{ + { + "set mykey 0 60 5\r\nvalue", + "set mykey 0 60 5", + }, + { + "get mykey", + "get mykey", + }, + { + "add newkey 0 60 5\r\nvalue", + "add newkey 0 60 5", + }, + { + "add newkey 0 60 5\r\nvalue", + "add newkey 0 60 5", + }, + { + "decr mykey 5", + "decr mykey 5", + }, + } { + span := pb.Span{ + Type: "memcached", + Meta: map[string]string{k: tt.in}, + } + NewObfuscator(nil).obfuscateMemcached(&span) + assert.Equal(t, tt.out, span.Meta[k]) + } +} diff --git a/pkg/trace/obfuscate/obfuscate.go b/pkg/trace/obfuscate/obfuscate.go new file mode 100644 index 0000000000000..12c09db40ca37 --- /dev/null +++ b/pkg/trace/obfuscate/obfuscate.go @@ -0,0 +1,92 @@ +// Package obfuscate implements quantizing and obfuscating of tags and resources for +// a set of spans matching a certain criteria. +package obfuscate + +import ( + "bytes" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// Obfuscator quantizes and obfuscates spans. 
The obfuscator is not safe for +// concurrent use. +type Obfuscator struct { + opts *config.ObfuscationConfig + sql *sqlObfuscator + es *jsonObfuscator // nil if disabled + mongo *jsonObfuscator // nil if disabled +} + +// NewObfuscator creates a new Obfuscator. +func NewObfuscator(cfg *config.ObfuscationConfig) *Obfuscator { + if cfg == nil { + cfg = new(config.ObfuscationConfig) + } + o := Obfuscator{ + opts: cfg, + sql: newSQLObfuscator(), + } + if cfg.ES.Enabled { + o.es = newJSONObfuscator(&cfg.ES) + } + if cfg.Mongo.Enabled { + o.mongo = newJSONObfuscator(&cfg.Mongo) + } + return &o +} + +// Obfuscate may obfuscate span's properties based on its type and on the Obfuscator's +// configuration. +func (o *Obfuscator) Obfuscate(span *pb.Span) { + switch span.Type { + case "sql", "cassandra": + o.obfuscateSQL(span) + case "redis": + o.quantizeRedis(span) + if o.opts.Redis.Enabled { + o.obfuscateRedis(span) + } + case "memcached": + if o.opts.Memcached.Enabled { + o.obfuscateMemcached(span) + } + case "web", "http": + o.obfuscateHTTP(span) + case "mongodb": + o.obfuscateJSON(span, "mongodb.query", o.mongo) + case "elasticsearch": + o.obfuscateJSON(span, "elasticsearch.body", o.es) + } +} + +// compactWhitespaces compacts all whitespaces in t. +func compactWhitespaces(t string) string { + n := len(t) + r := make([]byte, n) + spaceCode := uint8(32) + isWhitespace := func(char uint8) bool { return char == spaceCode } + nr := 0 + offset := 0 + for i := 0; i < n; i++ { + if isWhitespace(t[i]) { + copy(r[nr:], t[nr+offset:i]) + r[i-offset] = spaceCode + nr = i + 1 - offset + for j := i + 1; j < n; j++ { + if !isWhitespace(t[j]) { + offset += j - i - 1 + i = j + break + } else if j == n-1 { + offset += j - i + i = j + break + } + } + } + } + copy(r[nr:], t[nr+offset:n]) + r = r[:n-offset] + return string(bytes.Trim(r, " ")) +} diff --git a/pkg/trace/obfuscate/obfuscate_test.go b/pkg/trace/obfuscate/obfuscate_test.go new file mode 100644 index 0000000000000..21a4792825e3f --- /dev/null +++ b/pkg/trace/obfuscate/obfuscate_test.go @@ -0,0 +1,233 @@ +package obfuscate + +import ( + "flag" + "log" + "os" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/cihub/seelog" + "github.com/stretchr/testify/assert" +) + +type compactSpacesTestCase struct { + before string + after string +} + +func TestMain(m *testing.M) { + flag.Parse() + + // disable loggging in tests + seelog.UseLogger(seelog.Disabled) + + // prepare JSON obfuscator tests + suite, err := loadTests() + if err != nil { + log.Fatal(err) + } + if len(suite) == 0 { + log.Fatal("no tests in suite") + } + jsonSuite = suite + + os.Exit(m.Run()) +} + +func TestNewObfuscator(t *testing.T) { + assert := assert.New(t) + o := NewObfuscator(nil) + assert.Nil(o.es) + assert.Nil(o.mongo) + + o = NewObfuscator(&config.ObfuscationConfig{ + ES: config.JSONObfuscationConfig{}, + Mongo: config.JSONObfuscationConfig{}, + }) + assert.Nil(o.es) + assert.Nil(o.mongo) + + o = NewObfuscator(&config.ObfuscationConfig{ + ES: config.JSONObfuscationConfig{Enabled: true}, + Mongo: config.JSONObfuscationConfig{Enabled: true}, + }) + assert.NotNil(o.es) + assert.NotNil(o.mongo) +} + +func TestCompactWhitespaces(t *testing.T) { + assert := assert.New(t) + + resultsToExpect := []compactSpacesTestCase{ + {"aa", + "aa"}, + + {" aa bb", + "aa bb"}, + + {"aa bb cc dd ", + "aa bb cc dd"}, + + {" ", + ""}, + + {"a b cde fg hi j jk lk lkjfdsalfd afsd sfdafsd f", + "a b cde fg hi j jk lk lkjfdsalfd afsd 
sfdafsd f"}, + + {" ¡™£¢∞§¶ •ªº–≠œ∑´®†¥¨ˆøπ “‘«åß∂ƒ©˙∆˚¬…æΩ≈ç√ ∫˜µ≤≥÷ ", + "¡™£¢∞§¶ •ªº–≠œ∑´®†¥¨ˆøπ “‘«åß∂ƒ©˙∆˚¬…æΩ≈ç√ ∫˜µ≤≥÷"}, + } + + for _, testCase := range resultsToExpect { + assert.Equal(testCase.after, compactWhitespaces(testCase.before)) + } +} + +// TestObfuscateDefaults ensures that running the obfuscator with no config continues to obfuscate/quantize +// SQL queries and Redis commands in span resources. +func TestObfuscateDefaults(t *testing.T) { + t.Run("redis", func(t *testing.T) { + cmd := "SET k v\nGET k" + span := &pb.Span{ + Type: "redis", + Resource: cmd, + Meta: map[string]string{"redis.raw_command": cmd}, + } + NewObfuscator(nil).Obfuscate(span) + assert.Equal(t, cmd, span.Meta["redis.raw_command"]) + assert.Equal(t, "SET GET", span.Resource) + }) + + t.Run("sql", func(t *testing.T) { + query := "UPDATE users(name) SET ('Jim')" + span := &pb.Span{ + Type: "sql", + Resource: query, + Meta: map[string]string{"sql.query": query}, + } + NewObfuscator(nil).Obfuscate(span) + assert.Equal(t, query, span.Meta["sql.query"]) + assert.Equal(t, "UPDATE users ( name ) SET ( ? )", span.Resource) + }) +} + +func TestObfuscateConfig(t *testing.T) { + // testConfig returns a test function which creates a span of type typ, + // having a tag with key/val, runs the obfuscator on it using the given + // configuration and asserts that the new tag value matches exp. + testConfig := func( + typ, key, val, exp string, + cfg *config.ObfuscationConfig, + ) func(*testing.T) { + return func(t *testing.T) { + span := &pb.Span{Type: typ, Meta: map[string]string{key: val}} + NewObfuscator(cfg).Obfuscate(span) + assert.Equal(t, exp, span.Meta[key]) + } + } + + t.Run("redis/enabled", testConfig( + "redis", + "redis.raw_command", + "SET key val", + "SET key ?", + &config.ObfuscationConfig{ + Redis: config.Enablable{Enabled: true}, + }, + )) + + t.Run("redis/disabled", testConfig( + "redis", + "redis.raw_command", + "SET key val", + "SET key val", + &config.ObfuscationConfig{}, + )) + + t.Run("http/enabled", testConfig( + "http", + "http.url", + "http://mysite.mydomain/1/2?q=asd", + "http://mysite.mydomain/?/??", + &config.ObfuscationConfig{ + HTTP: config.HTTPObfuscationConfig{ + RemovePathDigits: true, + RemoveQueryString: true, + }, + }, + )) + + t.Run("http/disabled", testConfig( + "http", + "http.url", + "http://mysite.mydomain/1/2?q=asd", + "http://mysite.mydomain/1/2?q=asd", + &config.ObfuscationConfig{}, + )) + + t.Run("web/enabled", testConfig( + "web", + "http.url", + "http://mysite.mydomain/1/2?q=asd", + "http://mysite.mydomain/?/??", + &config.ObfuscationConfig{ + HTTP: config.HTTPObfuscationConfig{ + RemovePathDigits: true, + RemoveQueryString: true, + }, + }, + )) + + t.Run("web/disabled", testConfig( + "web", + "http.url", + "http://mysite.mydomain/1/2?q=asd", + "http://mysite.mydomain/1/2?q=asd", + &config.ObfuscationConfig{}, + )) + + t.Run("json/enabled", testConfig( + "elasticsearch", + "elasticsearch.body", + `{"role": "database"}`, + `{"role":"?"}`, + &config.ObfuscationConfig{ + ES: config.JSONObfuscationConfig{Enabled: true}, + }, + )) + + t.Run("json/disabled", testConfig( + "elasticsearch", + "elasticsearch.body", + `{"role": "database"}`, + `{"role": "database"}`, + &config.ObfuscationConfig{}, + )) + + t.Run("memcached/enabled", testConfig( + "memcached", + "memcached.command", + "set key 0 0 0\r\nvalue", + "set key 0 0 0", + &config.ObfuscationConfig{ + Memcached: config.Enablable{Enabled: true}, + }, + )) + + t.Run("memcached/disabled", testConfig( + "memcached", + 
"memcached.command", + "set key 0 0 0 noreply\r\nvalue", + "set key 0 0 0 noreply\r\nvalue", + &config.ObfuscationConfig{}, + )) +} + +func BenchmarkCompactWhitespaces(b *testing.B) { + str := "a b cde fg hi j jk lk lkjfdsalfd afsd sfdafsd f" + for i := 0; i < b.N; i++ { + compactWhitespaces(str) + } +} diff --git a/pkg/trace/obfuscate/redis.go b/pkg/trace/obfuscate/redis.go new file mode 100644 index 0000000000000..c3f0e329abeac --- /dev/null +++ b/pkg/trace/obfuscate/redis.go @@ -0,0 +1,257 @@ +package obfuscate + +import ( + "strings" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// redisTruncationMark is used as suffix by tracing libraries to indicate that a +// command was truncated. +const redisTruncationMark = "..." + +const maxRedisNbCommands = 3 + +// Redis commands consisting in 2 words +var redisCompoundCommandSet = map[string]bool{ + "CLIENT": true, "CLUSTER": true, "COMMAND": true, "CONFIG": true, "DEBUG": true, "SCRIPT": true} + +// quantizeRedis generates resource for Redis spans +// TODO(gbbr): Refactor this method to use the tokenizer and +// remove "compactWhitespaces". This method is buggy when commands +// contain quoted strings with newlines. +func (*Obfuscator) quantizeRedis(span *pb.Span) { + query := compactWhitespaces(span.Resource) + + var resource strings.Builder + truncated := false + nbCmds := 0 + + for len(query) > 0 && nbCmds < maxRedisNbCommands { + var rawLine string + + // Read the next command + idx := strings.IndexByte(query, '\n') + if idx == -1 { + rawLine = query + query = "" + } else { + rawLine = query[:idx] + query = query[idx+1:] + } + + line := strings.Trim(rawLine, " ") + if len(line) == 0 { + continue + } + + // Parse arguments + args := strings.SplitN(line, " ", 3) + + if strings.HasSuffix(args[0], redisTruncationMark) { + truncated = true + continue + } + + command := strings.ToUpper(args[0]) + + if redisCompoundCommandSet[command] && len(args) > 1 { + if strings.HasSuffix(args[1], redisTruncationMark) { + truncated = true + continue + } + + command += " " + strings.ToUpper(args[1]) + } + + // Write the command representation + resource.WriteByte(' ') + resource.WriteString(command) + + nbCmds++ + truncated = false + } + + if nbCmds == maxRedisNbCommands || truncated { + resource.WriteString(" ...") + } + + span.Resource = strings.Trim(resource.String(), " ") +} + +const redisRawCommand = "redis.raw_command" + +// obfuscateRedis obfuscates arguments inside the given span's "redis.raw_command" tag, if it exists +// and is non-empty. +func (*Obfuscator) obfuscateRedis(span *pb.Span) { + if span.Meta == nil || span.Meta[redisRawCommand] == "" { + // nothing to do + return + } + t := newRedisTokenizer([]byte(span.Meta[redisRawCommand])) + var ( + str strings.Builder + cmd string + args []string + ) + for { + tok, typ, done := t.scan() + switch typ { + case redisTokenCommand: + // new command starting + if cmd != "" { + // a previous command was buffered, obfuscate it + obfuscateRedisCmd(&str, cmd, args...) + str.WriteByte('\n') + } + cmd = tok + args = args[:0] + case redisTokenArgument: + args = append(args, tok) + } + if done { + // last command + obfuscateRedisCmd(&str, cmd, args...) 
+ break + } + } + span.Meta[redisRawCommand] = str.String() +} + +func obfuscateRedisCmd(out *strings.Builder, cmd string, args ...string) { + out.WriteString(cmd) + if len(args) == 0 { + return + } + out.WriteByte(' ') + + switch strings.ToUpper(cmd) { + case "APPEND", "GETSET", "LPUSHX", "GEORADIUSBYMEMBER", "RPUSHX", + "SET", "SETNX", "SISMEMBER", "ZRANK", "ZREVRANK", "ZSCORE": + // Obfuscate 2nd argument: + // • APPEND key value + // • GETSET key value + // • LPUSHX key value + // • GEORADIUSBYMEMBER key member radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC] [STORE key] [STOREDIST key] + // • RPUSHX key value + // • SET key value [expiration EX seconds|PX milliseconds] [NX|XX] + // • SETNX key value + // • SISMEMBER key member + // • ZRANK key member + // • ZREVRANK key member + // • ZSCORE key member + obfuscateRedisArgN(args, 1) + + case "HSET", "HSETNX", "LREM", "LSET", "SETBIT", "SETEX", "PSETEX", + "SETRANGE", "ZINCRBY", "SMOVE", "RESTORE": + // Obfuscate 3rd argument: + // • HSET key field value + // • HSETNX key field value + // • LREM key count value + // • LSET key index value + // • SETBIT key offset value + // • SETEX key seconds value + // • PSETEX key milliseconds value + // • SETRANGE key offset value + // • ZINCRBY key increment member + // • SMOVE source destination member + // • RESTORE key ttl serialized-value [REPLACE] + obfuscateRedisArgN(args, 2) + + case "LINSERT": + // Obfuscate 4th argument: + // • LINSERT key BEFORE|AFTER pivot value + obfuscateRedisArgN(args, 3) + + case "GEOHASH", "GEOPOS", "GEODIST", "LPUSH", "RPUSH", "SREM", + "ZREM", "SADD": + // Obfuscate all arguments after the first one. + // • GEOHASH key member [member ...] + // • GEOPOS key member [member ...] + // • GEODIST key member1 member2 [unit] + // • LPUSH key value [value ...] + // • RPUSH key value [value ...] + // • SREM key member [member ...] + // • ZREM key member [member ...] + // • SADD key member [member ...] + if len(args) > 1 { + args[1] = "?" + args = args[:2] + } + + case "GEOADD": + // Obfuscating every 3rd argument starting from first + // • GEOADD key longitude latitude member [longitude latitude member ...] + obfuscateRedisArgsStep(args, 1, 3) + + case "HMSET": + // Every 2nd argument starting from first. + // • HMSET key field value [field value ...] + obfuscateRedisArgsStep(args, 1, 2) + + case "MSET", "MSETNX": + // Every 2nd argument starting from command. + // • MSET key value [key value ...] + // • MSETNX key value [key value ...] + obfuscateRedisArgsStep(args, 0, 2) + + case "CONFIG": + // Obfuscate 2nd argument to SET sub-command. + // • CONFIG SET parameter value + if strings.ToUpper(args[0]) == "SET" { + obfuscateRedisArgN(args, 2) + } + + case "BITFIELD": + // Obfuscate 3rd argument to SET sub-command: + // • BITFIELD key [GET type offset] [SET type offset value] [INCRBY type offset increment] [OVERFLOW WRAP|SAT|FAIL] + var n int + for i, arg := range args { + if strings.ToUpper(arg) == "SET" { + n = i + } + if n > 0 && i-n == 3 { + args[i] = "?" + break + } + } + + case "ZADD": + // Obfuscate every 2nd argument after potential optional ones. + // • ZADD key [NX|XX] [CH] [INCR] score member [score member ...] + var i int + loop: + for i = range args { + if i == 0 { + continue // key + } + switch args[i] { + case "NX", "XX", "CH", "INCR": + // continue + default: + break loop + } + } + obfuscateRedisArgsStep(args, i, 2) + + default: + // Obfuscate nothing. 
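+		// Commands not listed above, e.g. "GET key", "DEL key" or
+		// "EXPIRE key 60", pass through with their arguments unchanged.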
+ } + out.WriteString(strings.Join(args, " ")) +} + +func obfuscateRedisArgN(args []string, n int) { + if len(args) > n { + args[n] = "?" + } +} + +func obfuscateRedisArgsStep(args []string, start, step int) { + if start+step-1 >= len(args) { + // can't reach target + return + } + for i := start + step - 1; i < len(args); i += step { + args[i] = "?" + } +} diff --git a/pkg/trace/obfuscate/redis_test.go b/pkg/trace/obfuscate/redis_test.go new file mode 100644 index 0000000000000..798cdbe547094 --- /dev/null +++ b/pkg/trace/obfuscate/redis_test.go @@ -0,0 +1,381 @@ +package obfuscate + +import ( + "fmt" + "strconv" + "strings" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +type redisTestCase struct { + query string + expectedResource string +} + +func redisSpan(query string) *pb.Span { + return &pb.Span{ + Resource: query, + Type: "redis", + Meta: map[string]string{redisRawCommand: query}, + } +} + +func TestRedisQuantizer(t *testing.T) { + assert := assert.New(t) + + queryToExpected := []redisTestCase{ + {"CLIENT", + "CLIENT"}, // regression test for DataDog/datadog-trace-agent#421 + + {"CLIENT LIST", + "CLIENT LIST"}, + + {"get my_key", + "GET"}, + + {"SET le_key le_value", + "SET"}, + + {"\n\n \nSET foo bar \n \n\n ", + "SET"}, + + {"CONFIG SET parameter value", + "CONFIG SET"}, + + {"SET toto tata \n \n EXPIRE toto 15 ", + "SET EXPIRE"}, + + {"MSET toto tata toto tata toto tata \n ", + "MSET"}, + + {"MULTI\nSET k1 v1\nSET k2 v2\nSET k3 v3\nSET k4 v4\nDEL to_del\nEXEC", + "MULTI SET SET ..."}, + + {"DEL k1\nDEL k2\nHMSET k1 \"a\" 1 \"b\" 2 \"c\" 3\nHMSET k2 \"d\" \"4\" \"e\" \"4\"\nDEL k3\nHMSET k3 \"f\" \"5\"\nDEL k1\nDEL k2\nHMSET k1 \"a\" 1 \"b\" 2 \"c\" 3\nHMSET k2 \"d\" \"4\" \"e\" \"4\"\nDEL k3\nHMSET k3 \"f\" \"5\"\nDEL k1\nDEL k2\nHMSET k1 \"a\" 1 \"b\" 2 \"c\" 3\nHMSET k2 \"d\" \"4\" \"e\" \"4\"\nDEL k3\nHMSET k3 \"f\" \"5\"\nDEL k1\nDEL k2\nHMSET k1 \"a\" 1 \"b\" 2 \"c\" 3\nHMSET k2 \"d\" \"4\" \"e\" \"4\"\nDEL k3\nHMSET k3 \"f\" \"5\"", + "DEL DEL HMSET ..."}, + + {"GET...", + "..."}, + + {"GET k...", + "GET"}, + + {"GET k1\nGET k2\nG...", + "GET GET ..."}, + + {"GET k1\nGET k2\nDEL k3\nGET k...", + "GET GET DEL ..."}, + + {"GET k1\nGET k2\nHDEL k3 a\nG...", + "GET GET HDEL ..."}, + + {"GET k...\nDEL k2\nMS...", + "GET DEL ..."}, + + {"GET k...\nDE...\nMS...", + "GET ..."}, + + {"GET k1\nDE...\nGET k2", + "GET GET"}, + + {"GET k1\nDE...\nGET k2\nHDEL k3 a\nGET k4\nDEL k5", + "GET GET HDEL ..."}, + + {"UNKNOWN 123", + "UNKNOWN"}, + } + + for _, testCase := range queryToExpected { + s := redisSpan(testCase.query) + NewObfuscator(nil).Obfuscate(s) + assert.Equal(testCase.expectedResource, s.Resource) + } +} + +func TestRedisObfuscator(t *testing.T) { + for ti, tt := range [...]struct { + in, out string + }{ + { + "APPEND key value", + "APPEND key ?", + }, + { + "GETSET key value", + "GETSET key ?", + }, + { + "LPUSHX key value", + "LPUSHX key ?", + }, + { + "GEORADIUSBYMEMBER key member radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC] [STORE key] [STOREDIST key]", + "GEORADIUSBYMEMBER key ? radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC] [STORE key] [STOREDIST key]", + }, + { + "RPUSHX key value", + "RPUSHX key ?", + }, + { + "SET key value", + "SET key ?", + }, + { + "SET key value [expiration EX seconds|PX milliseconds] [NX|XX]", + "SET key ? 
[expiration EX seconds|PX milliseconds] [NX|XX]", + }, + { + "SETNX key value", + "SETNX key ?", + }, + { + "SISMEMBER key member", + "SISMEMBER key ?", + }, + { + "ZRANK key member", + "ZRANK key ?", + }, + { + "ZREVRANK key member", + "ZREVRANK key ?", + }, + { + "ZSCORE key member", + "ZSCORE key ?", + }, + { + "BITFIELD key GET type offset SET type offset value INCRBY type", + "BITFIELD key GET type offset SET type offset ? INCRBY type", + }, + { + "BITFIELD key SET type offset value INCRBY type", + "BITFIELD key SET type offset ? INCRBY type", + }, + { + "BITFIELD key GET type offset INCRBY type", + "BITFIELD key GET type offset INCRBY type", + }, + { + "BITFIELD key SET type offset", + "BITFIELD key SET type offset", + }, + { + "CONFIG SET parameter value", + "CONFIG SET parameter ?", + }, + { + "CONFIG foo bar baz", + "CONFIG foo bar baz", + }, + { + "GEOADD key longitude latitude member longitude latitude member longitude latitude member", + "GEOADD key longitude latitude ? longitude latitude ? longitude latitude ?", + }, + { + "GEOADD key longitude latitude member longitude latitude member", + "GEOADD key longitude latitude ? longitude latitude ?", + }, + { + "GEOADD key longitude latitude member", + "GEOADD key longitude latitude ?", + }, + { + "GEOADD key longitude latitude", + "GEOADD key longitude latitude", + }, + { + "GEOADD key", + "GEOADD key", + }, + { + "GEOHASH key\nGEOPOS key\n GEODIST key", + "GEOHASH key\nGEOPOS key\nGEODIST key", + }, + { + "GEOHASH key member\nGEOPOS key member\nGEODIST key member\n", + "GEOHASH key ?\nGEOPOS key ?\nGEODIST key ?", + }, + { + "GEOHASH key member member member\nGEOPOS key member member \n GEODIST key member member member", + "GEOHASH key ?\nGEOPOS key ?\nGEODIST key ?", + }, + { + "GEOPOS key member [member ...]", + "GEOPOS key ?", + }, + { + "SREM key member [member ...]", + "SREM key ?", + }, + { + "ZREM key member [member ...]", + "ZREM key ?", + }, + { + "SADD key member [member ...]", + "SADD key ?", + }, + { + "GEODIST key member1 member2 [unit]", + "GEODIST key ?", + }, + { + "LPUSH key value [value ...]", + "LPUSH key ?", + }, + { + "RPUSH key value [value ...]", + "RPUSH key ?", + }, + { + "HSET key field value \nHSETNX key field value\nBLAH", + "HSET key field ?\nHSETNX key field ?\nBLAH", + }, + { + "HSET key field value", + "HSET key field ?", + }, + { + "HSETNX key field value", + "HSETNX key field ?", + }, + { + "LREM key count value", + "LREM key count ?", + }, + { + "LSET key index value", + "LSET key index ?", + }, + { + "SETBIT key offset value", + "SETBIT key offset ?", + }, + { + "SETRANGE key offset value", + "SETRANGE key offset ?", + }, + { + "SETEX key seconds value", + "SETEX key seconds ?", + }, + { + "PSETEX key milliseconds value", + "PSETEX key milliseconds ?", + }, + { + "ZINCRBY key increment member", + "ZINCRBY key increment ?", + }, + { + "SMOVE source destination member", + "SMOVE source destination ?", + }, + { + "RESTORE key ttl serialized-value [REPLACE]", + "RESTORE key ttl ? [REPLACE]", + }, + { + "LINSERT key BEFORE pivot value", + "LINSERT key BEFORE pivot ?", + }, + { + "LINSERT key AFTER pivot value", + "LINSERT key AFTER pivot ?", + }, + { + "HMSET key field value field value", + "HMSET key field ? field ?", + }, + { + "HMSET key field value \n HMSET key field value\n\n ", + "HMSET key field ?\nHMSET key field ?", + }, + { + "HMSET key field", + "HMSET key field", + }, + { + "MSET key value key value", + "MSET key ? 
key ?", + }, + { + "MSET\nMSET key value", + "MSET\nMSET key ?", + }, + { + "MSET key value", + "MSET key ?", + }, + { + "MSETNX key value key value", + "MSETNX key ? key ?", + }, + { + "ZADD key score member score member", + "ZADD key score ? score ?", + }, + { + "ZADD key NX score member score member", + "ZADD key NX score ? score ?", + }, + { + "ZADD key NX CH score member score member", + "ZADD key NX CH score ? score ?", + }, + { + "ZADD key NX CH INCR score member score member", + "ZADD key NX CH INCR score ? score ?", + }, + { + "ZADD key XX INCR score member score member", + "ZADD key XX INCR score ? score ?", + }, + { + "ZADD key XX INCR score member", + "ZADD key XX INCR score ?", + }, + { + "ZADD key XX INCR score", + "ZADD key XX INCR score", + }, + { + ` +CONFIG command +SET k v + `, + `CONFIG command +SET k ?`, + }, + } { + t.Run(strconv.Itoa(ti), func(t *testing.T) { + span := redisSpan(tt.in) + NewObfuscator(nil).obfuscateRedis(span) + assert.Equal(t, tt.out, span.Meta[redisRawCommand], tt.in) + }) + } +} + +func BenchmarkRedisObfuscator(b *testing.B) { + cmd := strings.Repeat("GEOADD key longitude latitude member longitude latitude member longitude latitude member\n", 5) + span := redisSpan(cmd) + b.Run(fmt.Sprintf("%db", len(cmd)), func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + NewObfuscator(nil).obfuscateRedis(span) + } + }) +} + +func BenchmarkRedisQuantizer(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + span := redisSpan(`DEL k1\nDEL k2\nHMSET k1 "a" 1 "b" 2 "c" 3\nHMSET k2 "d" "4" "e" "4"\nDEL k3\nHMSET k3 "f" "5"\nDEL k1\nDEL k2\nHMSET k1 "a" 1 "b" 2 "c" 3\nHMSET k2 "d" "4" "e" "4"\nDEL k3\nHMSET k3 "f" "5"\nDEL k1\nDEL k2\nHMSET k1 "a" 1 "b" 2 "c" 3\nHMSET k2 "d" "4" "e" "4"\nDEL k3\nHMSET k3 "f" "5"\nDEL k1\nDEL k2\nHMSET k1 "a" 1 "b" 2 "c" 3\nHMSET k2 "d" "4" "e" "4"\nDEL k3\nHMSET k3 "f" "5"`) + NewObfuscator(nil).quantizeRedis(span) + } +} diff --git a/pkg/trace/obfuscate/redis_tokenizer.go b/pkg/trace/obfuscate/redis_tokenizer.go new file mode 100644 index 0000000000000..83f3d9aa65984 --- /dev/null +++ b/pkg/trace/obfuscate/redis_tokenizer.go @@ -0,0 +1,182 @@ +package obfuscate + +import ( + "bytes" + "strings" +) + +// redisTokenType specifies the token type returned by the tokenizer. +type redisTokenType int + +const ( + // redisTokenCommand is a command token. For compound tokens, it is + // only the first part up to a space. + redisTokenCommand redisTokenType = iota + + // redisTokenArgument is an argument token. + redisTokenArgument +) + +// String implements fmt.Stringer. +func (t redisTokenType) String() string { + return map[redisTokenType]string{ + redisTokenCommand: "command", + redisTokenArgument: "argument", + }[t] +} + +// redisTokenizer tokenizes a Redis command string. The string can be on +// multiple lines. The tokenizer is capable of parsing quoted strings and escape +// sequences inside them. +type redisTokenizer struct { + data []byte + ch byte + off int + done bool + state redisParseState +} + +// redisParseState specifies the current state of the tokenizer. +type redisParseState int + +const ( + // redisStateCommand specifies that we are about to parse a command. + // It is usually the state at the beginning of the scan or after a + // new line. + redisStateCommand redisParseState = iota + + // redisStateArgument specifies that we are about to parse an argument + // to a command or the rest of the tokens in a compound command. 
+ redisStateArgument +) + +// newRedisTokenizer returns a new tokenizer for the given data. +func newRedisTokenizer(data []byte) *redisTokenizer { + return &redisTokenizer{ + data: bytes.TrimSpace(data), + off: -1, + state: redisStateCommand, + } +} + +// scan returns the next token, it's type and a bool. The boolean specifies if +// the returned token was the last one. +func (t *redisTokenizer) scan() (tok string, typ redisTokenType, done bool) { + switch t.state { + case redisStateCommand: + return t.scanCommand() + default: + return t.scanArg() + } +} + +// next advances the scanner to the next character. +func (t *redisTokenizer) next() { + t.off++ + if t.off <= len(t.data)-1 { + t.ch = t.data[t.off] + return + } + t.done = true +} + +// scanCommand scans a command from the buffer. +func (t *redisTokenizer) scanCommand() (tok string, typ redisTokenType, done bool) { + var ( + str strings.Builder + started bool + ) + for { + t.next() + if t.done { + return str.String(), typ, t.done + } + switch t.ch { + case ' ': + if !started { + // skip spaces preceding token + t.skipSpace() + break + } + // done scanning command + t.state = redisStateArgument + t.skipSpace() + return str.String(), redisTokenCommand, t.done + case '\n': + return str.String(), redisTokenCommand, t.done + default: + str.WriteByte(t.ch) + } + started = true + } +} + +// scanArg scans an argument from the buffer. +func (t *redisTokenizer) scanArg() (tok string, typ redisTokenType, done bool) { + var ( + str strings.Builder + quoted bool // in quoted string + escape bool // escape sequence + ) + for { + t.next() + if t.done { + return str.String(), redisTokenArgument, t.done + } + switch t.ch { + case '\\': + str.WriteByte('\\') + if !escape { + // next character could be escaped + escape = true + continue + } + case '\n': + if !quoted { + // last argument, new command follows + t.state = redisStateCommand + return str.String(), redisTokenArgument, t.done + } + str.WriteByte('\n') + case '"': + str.WriteByte('"') + if !escape { + // this quote wasn't escaped, toggle quoted mode + quoted = !quoted + } + case ' ': + if !quoted { + t.skipSpace() + return str.String(), redisTokenArgument, t.done + } + str.WriteByte(' ') + default: + str.WriteByte(t.ch) + } + escape = false + } +} + +// unread is the reverse of next, unreading a character. +func (t *redisTokenizer) unread() { + if t.off < 1 { + return + } + t.off-- + t.ch = t.data[t.off] +} + +// skipSpace moves the cursor forward until it meets the last space +// in a sequence of contiguous spaces. 
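+// When the run of spaces ends at a newline, the state is switched back to
+// redisStateCommand so that the next token is scanned as a command.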
+func (t *redisTokenizer) skipSpace() { + for t.ch == ' ' || t.ch == '\t' || t.ch == '\r' && !t.done { + t.next() + } + if t.ch == '\n' { + // next token is a command + t.state = redisStateCommand + } else { + // don't steal the first non-space character + t.unread() + } +} diff --git a/pkg/trace/obfuscate/redis_tokenizer_test.go b/pkg/trace/obfuscate/redis_tokenizer_test.go new file mode 100644 index 0000000000000..edc44d4539f5a --- /dev/null +++ b/pkg/trace/obfuscate/redis_tokenizer_test.go @@ -0,0 +1,231 @@ +package obfuscate + +import ( + "strconv" + "testing" +) + +func TestRedisTokenizer(t *testing.T) { + type testResult struct { + tok string + typ redisTokenType + done bool + } + for ti, tt := range []struct { + in string + out []testResult + }{ + { + in: "", + out: []testResult{{"", redisTokenCommand, true}}, + }, + { + in: "BAD\"\"INPUT\" \"boo\n Weird13\\Stuff", + out: []testResult{ + {"BAD\"\"INPUT\"", redisTokenCommand, false}, + {"\"boo\n Weird13\\Stuff", redisTokenArgument, true}, + }, + }, + { + in: "CMD", + out: []testResult{ + {"CMD", redisTokenCommand, true}, + }, + }, + { + in: "\n \nCMD\n \n", + out: []testResult{ + {"CMD", redisTokenCommand, true}, + }, + }, + { + in: " CMD ", + out: []testResult{ + {"CMD", redisTokenCommand, true}, + }, + }, + { + in: "CMD1\nCMD2", + out: []testResult{ + {"CMD1", redisTokenCommand, false}, + {"CMD2", redisTokenCommand, true}, + }, + }, + { + in: " CMD1 \n CMD2 ", + out: []testResult{ + {"CMD1", redisTokenCommand, false}, + {"CMD2", redisTokenCommand, true}, + }, + }, + { + in: "CMD1\nCMD2\nCMD3", + out: []testResult{ + {"CMD1", redisTokenCommand, false}, + {"CMD2", redisTokenCommand, false}, + {"CMD3", redisTokenCommand, true}, + }, + }, + { + in: "CMD1 \n CMD2 \n CMD3 ", + out: []testResult{ + {"CMD1", redisTokenCommand, false}, + {"CMD2", redisTokenCommand, false}, + {"CMD3", redisTokenCommand, true}, + }, + }, + { + in: "CMD arg", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg", redisTokenArgument, true}, + }, + }, + { + in: " CMD arg ", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg", redisTokenArgument, true}, + }, + }, + { + in: "CMD arg1 arg2", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg1", redisTokenArgument, false}, + {"arg2", redisTokenArgument, true}, + }, + }, + { + in: " CMD arg1 arg2 ", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg1", redisTokenArgument, false}, + {"arg2", redisTokenArgument, true}, + }, + }, + { + in: "CMD arg1\nCMD2 arg2", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg1", redisTokenArgument, false}, + {"CMD2", redisTokenCommand, false}, + {"arg2", redisTokenArgument, true}, + }, + }, + { + in: "CMD arg1 arg2\nCMD2 arg3\nCMD3\nCMD4 arg4 arg5 arg6", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg1", redisTokenArgument, false}, + {"arg2", redisTokenArgument, false}, + {"CMD2", redisTokenCommand, false}, + {"arg3", redisTokenArgument, false}, + {"CMD3", redisTokenCommand, false}, + {"CMD4", redisTokenCommand, false}, + {"arg4", redisTokenArgument, false}, + {"arg5", redisTokenArgument, false}, + {"arg6", redisTokenArgument, true}, + }, + }, + { + in: "CMD arg1 arg2 \n CMD2 arg3 \n CMD3 \n CMD4 arg4 arg5 arg6\nCMD5 ", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"arg1", redisTokenArgument, false}, + {"arg2", redisTokenArgument, false}, + {"CMD2", redisTokenCommand, false}, + {"arg3", redisTokenArgument, false}, + {"CMD3", redisTokenCommand, false}, + {"CMD4", 
redisTokenCommand, false}, + {"arg4", redisTokenArgument, false}, + {"arg5", redisTokenArgument, false}, + {"arg6", redisTokenArgument, false}, + {"CMD5", redisTokenCommand, true}, + }, + }, + { + in: `CMD ""`, + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {`""`, redisTokenArgument, true}, + }, + }, + { + in: `CMD "foo bar"`, + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {`"foo bar"`, redisTokenArgument, true}, + }, + }, + { + in: `CMD "foo bar\ " baz`, + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {`"foo bar\ "`, redisTokenArgument, false}, + {`baz`, redisTokenArgument, true}, + }, + }, + { + in: "CMD \"foo \n bar\" \"\" baz ", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"\"foo \n bar\"", redisTokenArgument, false}, + {`""`, redisTokenArgument, false}, + {"baz", redisTokenArgument, true}, + }, + }, + { + in: "CMD \"foo \\\" bar\" baz", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {"\"foo \\\" bar\"", redisTokenArgument, false}, + {"baz", redisTokenArgument, true}, + }, + }, + { + in: `CMD "foo bar" baz`, + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {`"foo bar"`, redisTokenArgument, false}, + {`baz`, redisTokenArgument, true}, + }, + }, + { + in: "CMD \"foo bar\" baz\nCMD2 \"baz\\\\bar\"", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {`"foo bar"`, redisTokenArgument, false}, + {`baz`, redisTokenArgument, false}, + {"CMD2", redisTokenCommand, false}, + {`"baz\\bar"`, redisTokenArgument, true}, + }, + }, + { + in: " CMD \"foo bar\" baz \n CMD2 \"baz\\\\bar\" ", + out: []testResult{ + {"CMD", redisTokenCommand, false}, + {`"foo bar"`, redisTokenArgument, false}, + {`baz`, redisTokenArgument, false}, + {"CMD2", redisTokenCommand, false}, + {`"baz\\bar"`, redisTokenArgument, true}, + }, + }, + } { + t.Run(strconv.Itoa(ti), func(t *testing.T) { + tokenizer := newRedisTokenizer([]byte(tt.in)) + for i := 0; i < len(tt.out); i++ { + tok, typ, done := tokenizer.scan() + if done != tt.out[i].done { + t.Fatalf("%d: wanted done: %v, got: %v", i, tt.out[i].done, done) + } + if tok != tt.out[i].tok { + t.Fatalf("%d: wanted token: %q, got: %q", i, tt.out[i].tok, tok) + } + if typ != tt.out[i].typ { + t.Fatalf("%d: wanted type: %s, got: %s", i, tt.out[i].typ, typ) + } + } + }) + } +} diff --git a/pkg/trace/obfuscate/sql.go b/pkg/trace/obfuscate/sql.go new file mode 100644 index 0000000000000..24c007907004a --- /dev/null +++ b/pkg/trace/obfuscate/sql.go @@ -0,0 +1,233 @@ +package obfuscate + +import ( + "bytes" + "errors" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" +) + +const sqlQueryTag = "sql.query" + +// tokenFilter is a generic interface that a sqlObfuscator expects. It defines +// the Filter() function used to filter or replace given tokens. +// A filter can be stateful and keep an internal state to apply the filter later; +// this can be useful to prevent backtracking in some cases. +type tokenFilter interface { + Filter(token, lastToken int, buffer []byte) (int, []byte) + Reset() +} + +// discardFilter implements the tokenFilter interface so that the given +// token is discarded or accepted. +type discardFilter struct{} + +// Filter the given token so that a `nil` slice is returned if the token +// is in the token filtered list. 
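+// For example, SQL comments and trailing semicolons are dropped entirely, and an
+// MSSQL bracketed identifier following AS (e.g. "AS [alias]") is discarded until
+// the closing bracket is reached.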
+func (f *discardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { + // filters based on previous token + switch lastToken { + case FilteredBracketedIdentifier: + if token != ']' { + // we haven't found the closing bracket yet, keep going + if token != ID { + // the token between the brackets *must* be an identifier, + // otherwise the query is invalid. + return LexError, nil + } + return FilteredBracketedIdentifier, nil + } + fallthrough + case As: + if token == '[' { + // the identifier followed by AS is an MSSQL bracketed identifier + // and will continue to be discarded until we find the corresponding + // closing bracket counter-part. See GitHub issue #475. + return FilteredBracketedIdentifier, nil + } + // prevent the next comma from being part of a groupingFilter + return FilteredComma, nil + } + + // filters based on the current token; if the next token should be ignored, + // return the same token value (not Filtered) and nil + switch token { + case As: + return As, nil + case Comment, ';': + return Filtered, nil + default: + return token, buffer + } +} + +// Reset in a discardFilter is a noop action +func (f *discardFilter) Reset() {} + +// replaceFilter implements the tokenFilter interface so that the given +// token is replaced with '?' or left unchanged. +type replaceFilter struct{} + +// Filter the given token so that it will be replaced if in the token replacement list +func (f *replaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { + switch lastToken { + case Savepoint: + return Filtered, []byte("?") + } + switch token { + case String, Number, Null, Variable, PreparedStatement, BooleanLiteral, EscapeSequence: + return Filtered, []byte("?") + default: + return token, buffer + } +} + +// Reset in a replaceFilter is a noop action +func (f *replaceFilter) Reset() {} + +// groupingFilter implements the tokenFilter interface so that when +// a common pattern is identified, it's discarded to prevent duplicates +type groupingFilter struct { + groupFilter int + groupMulti int +} + +// Filter the given token so that it will be discarded if a grouping pattern +// has been recognized. A grouping is composed by items like: +// * '( ?, ?, ? )' +// * '( ?, ? ), ( ?, ? )' +func (f *groupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { + // increasing the number of groups means that we're filtering an entire group + // because it can be represented with a single '( ? )' + if (lastToken == '(' && token == Filtered) || (token == '(' && f.groupMulti > 0) { + f.groupMulti++ + } + + switch { + case token == Filtered: + // the previous filter has dropped this token so we should start + // counting the group filter so that we accept only one '?' for + // the same group + f.groupFilter++ + + if f.groupFilter > 1 { + return Filtered, nil + } + case f.groupFilter > 0 && (token == ',' || token == '?'): + // if we are in a group drop all commas + return Filtered, nil + case f.groupMulti > 1: + // drop all tokens since we're in a counting group + // and they're duplicated + return Filtered, nil + case token != ',' && token != '(' && token != ')' && token != Filtered: + // when we're out of a group reset the filter state + f.Reset() + } + + return token, buffer +} + +// Reset in a groupingFilter restores variables used to count +// escaped token that should be filtered +func (f *groupingFilter) Reset() { + f.groupFilter = 0 + f.groupMulti = 0 +} + +// sqlObfuscator is a Tokenizer consumer. 
It calls the Tokenizer's Scan() function for as long as tokens are available,
+// or until a LexError is raised. Each retrieved token is sent through the
+// tokenFilter chain, where it may be discarded or replaced.
+type sqlObfuscator struct {
+	tokenizer *Tokenizer
+	filters   []tokenFilter
+	lastToken int
+}
+
+// obfuscate processes the given SQL or No-SQL string so that the resulting one is properly
+// altered. This function is generic and its behavior changes according to the chosen
+// tokenFilter implementations. The process calls all filters inside the []tokenFilter.
+func (t *sqlObfuscator) obfuscate(in string) (string, error) {
+	var out bytes.Buffer
+	t.reset(in)
+	token, buff := t.tokenizer.Scan()
+	for ; token != EOFChar; token, buff = t.tokenizer.Scan() {
+		if token == LexError {
+			return "", errors.New("the tokenizer was unable to process the string")
+		}
+		for _, f := range t.filters {
+			if token, buff = f.Filter(token, t.lastToken, buff); token == LexError {
+				return "", errors.New("the tokenizer was unable to process the string")
+			}
+		}
+		if buff != nil {
+			if out.Len() != 0 {
+				switch token {
+				case ',':
+				case '=':
+					if t.lastToken == ':' {
+						break
+					}
+					fallthrough
+				default:
+					out.WriteRune(' ')
+				}
+			}
+			out.Write(buff)
+		}
+		t.lastToken = token
+	}
+	return out.String(), nil
+}
+
+// reset restores the initial state of all components so that memory can be re-used.
+func (t *sqlObfuscator) reset(in string) {
+	t.tokenizer.Reset(in)
+	for _, f := range t.filters {
+		f.Reset()
+	}
+}
+
+// newSQLObfuscator returns a new sqlObfuscator capable of processing SQL and No-SQL strings.
+func newSQLObfuscator() *sqlObfuscator {
+	return &sqlObfuscator{
+		tokenizer: NewStringTokenizer(""),
+		filters: []tokenFilter{
+			&discardFilter{},
+			&replaceFilter{},
+			&groupingFilter{},
+		},
+	}
+}
+
+// obfuscateSQL generates the resource and the "sql.query" meta for SQL spans.
+func (o *Obfuscator) obfuscateSQL(span *pb.Span) {
+	if span.Resource == "" {
+		return
+	}
+	result, err := o.sql.obfuscate(span.Resource)
+	if err != nil || result == "" {
+		// we have an error, discard the SQL to avoid polluting user resources.
+		log.Debugf("Error parsing SQL query: %q", span.Resource)
+		if span.Meta == nil {
+			span.Meta = make(map[string]string, 1)
+		}
+		if _, ok := span.Meta[sqlQueryTag]; !ok {
+			span.Meta[sqlQueryTag] = span.Resource
+		}
+		span.Resource = "Non-parsable SQL query"
+		return
+	}
+
+	span.Resource = result
+
+	if span.Meta != nil && span.Meta[sqlQueryTag] != "" {
+		// "sql.query" tag already set by user, do not change it.
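+		// e.g. a tracer may have set "sql.query" to the raw statement itself; in
+		// that case only span.Resource is replaced with the obfuscated form.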
+ return + } + if span.Meta == nil { + span.Meta = make(map[string]string) + } + span.Meta[sqlQueryTag] = result +} diff --git a/pkg/trace/obfuscate/sql_test.go b/pkg/trace/obfuscate/sql_test.go new file mode 100644 index 0000000000000..330d5167eb1b5 --- /dev/null +++ b/pkg/trace/obfuscate/sql_test.go @@ -0,0 +1,488 @@ +package obfuscate + +import ( + "strconv" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +type sqlTestCase struct { + query string + expected string +} + +func SQLSpan(query string) *pb.Span { + return &pb.Span{ + Resource: query, + Type: "sql", + Meta: map[string]string{ + "sql.query": query, + }, + } +} + +func TestSQLResourceQuery(t *testing.T) { + assert := assert.New(t) + span := &pb.Span{ + Resource: "SELECT * FROM users WHERE id = 42", + Type: "sql", + Meta: map[string]string{ + "sql.query": "SELECT * FROM users WHERE id = 42", + }, + } + + NewObfuscator(nil).Obfuscate(span) + assert.Equal("SELECT * FROM users WHERE id = ?", span.Resource) + assert.Equal("SELECT * FROM users WHERE id = 42", span.Meta["sql.query"]) +} + +func TestSQLResourceWithoutQuery(t *testing.T) { + assert := assert.New(t) + span := &pb.Span{ + Resource: "SELECT * FROM users WHERE id = 42", + Type: "sql", + } + + NewObfuscator(nil).Obfuscate(span) + assert.Equal("SELECT * FROM users WHERE id = ?", span.Resource) + assert.Equal("SELECT * FROM users WHERE id = ?", span.Meta["sql.query"]) +} + +func TestSQLResourceWithError(t *testing.T) { + assert := assert.New(t) + testCases := []struct { + span pb.Span + }{ + { + pb.Span{ + Resource: "SELECT * FROM users WHERE id = '' AND '", + Type: "sql", + }, + }, + { + pb.Span{ + Resource: "INSERT INTO pages (id, name) VALUES (%(id0)s, %(name0)s), (%(id1)s, %(name1", + Type: "sql", + }, + }, + { + pb.Span{ + Resource: "INSERT INTO pages (id, name) VALUES (%(id0)s, %(name0)s), (%(id1)s, %(name1)", + Type: "sql", + }, + }, + } + + for _, tc := range testCases { + // copy test cases as Quantize mutates + testSpan := tc.span + + NewObfuscator(nil).Obfuscate(&tc.span) + assert.Equal("Non-parsable SQL query", tc.span.Resource) + assert.Equal(testSpan.Resource, tc.span.Meta["sql.query"]) + } +} + +func TestSQLQuantizer(t *testing.T) { + cases := []sqlTestCase{ + { + "select * from users where id = 42", + "select * from users where id = ?", + }, + { + "SELECT host, status FROM ec2_status WHERE org_id = 42", + "SELECT host, status FROM ec2_status WHERE org_id = ?", + }, + { + "SELECT host, status FROM ec2_status WHERE org_id=42", + "SELECT host, status FROM ec2_status WHERE org_id = ?", + }, + { + "-- get user \n--\n select * \n from users \n where\n id = 214325346", + "select * from users where id = ?", + }, + { + "SELECT * FROM `host` WHERE `id` IN (42, 43) /*comment with parameters,host:localhost,url:controller#home,id:FF005:00CAA*/", + "SELECT * FROM host WHERE id IN ( ? )", + }, + { + "SELECT `host`.`address` FROM `host` WHERE org_id=42", + "SELECT host . address FROM host WHERE org_id = ?", + }, + { + `SELECT "host"."address" FROM "host" WHERE org_id=42`, + `SELECT host . address FROM host WHERE org_id = ?`, + }, + { + `SELECT * FROM host WHERE id IN (42, 43) /* + multiline comment with parameters, + host:localhost,url:controller#home,id:FF005:00CAA + */`, + "SELECT * FROM host WHERE id IN ( ? )", + }, + { + "UPDATE user_dash_pref SET json_prefs = %(json_prefs)s, modified = '2015-08-27 22:10:32.492912' WHERE user_id = %(user_id)s AND url = %(url)s", + "UPDATE user_dash_pref SET json_prefs = ? 
modified = ? WHERE user_id = ? AND url = ?"}, + { + "SELECT DISTINCT host.id AS host_id FROM host JOIN host_alias ON host_alias.host_id = host.id WHERE host.org_id = %(org_id_1)s AND host.name NOT IN (%(name_1)s) AND host.name IN (%(name_2)s, %(name_3)s, %(name_4)s, %(name_5)s)", + "SELECT DISTINCT host.id FROM host JOIN host_alias ON host_alias.host_id = host.id WHERE host.org_id = ? AND host.name NOT IN ( ? ) AND host.name IN ( ? )", + }, + { + "SELECT org_id, metric_key FROM metrics_metadata WHERE org_id = %(org_id)s AND metric_key = ANY(array[75])", + "SELECT org_id, metric_key FROM metrics_metadata WHERE org_id = ? AND metric_key = ANY ( array [ ? ] )", + }, + { + "SELECT org_id, metric_key FROM metrics_metadata WHERE org_id = %(org_id)s AND metric_key = ANY(array[21, 25, 32])", + "SELECT org_id, metric_key FROM metrics_metadata WHERE org_id = ? AND metric_key = ANY ( array [ ? ] )", + }, + { + "SELECT articles.* FROM articles WHERE articles.id = 1 LIMIT 1", + "SELECT articles.* FROM articles WHERE articles.id = ? LIMIT ?", + }, + + { + "SELECT articles.* FROM articles WHERE articles.id = 1 LIMIT 1, 20", + "SELECT articles.* FROM articles WHERE articles.id = ? LIMIT ?", + }, + { + "SELECT articles.* FROM articles WHERE articles.id = 1 LIMIT 1, 20;", + "SELECT articles.* FROM articles WHERE articles.id = ? LIMIT ?", + }, + { + "SELECT articles.* FROM articles WHERE articles.id = 1 LIMIT 15,20;", + "SELECT articles.* FROM articles WHERE articles.id = ? LIMIT ?", + }, + { + "SELECT articles.* FROM articles WHERE articles.id = 1 LIMIT 1;", + "SELECT articles.* FROM articles WHERE articles.id = ? LIMIT ?", + }, + { + "SELECT articles.* FROM articles WHERE (articles.created_at BETWEEN '2016-10-31 23:00:00.000000' AND '2016-11-01 23:00:00.000000')", + "SELECT articles.* FROM articles WHERE ( articles.created_at BETWEEN ? AND ? )", + }, + { + "SELECT articles.* FROM articles WHERE (articles.created_at BETWEEN $1 AND $2)", + "SELECT articles.* FROM articles WHERE ( articles.created_at BETWEEN ? AND ? )", + }, + { + "SELECT articles.* FROM articles WHERE (articles.published != true)", + "SELECT articles.* FROM articles WHERE ( articles.published != ? )", + }, + { + "SELECT articles.* FROM articles WHERE (title = 'guides.rubyonrails.org')", + "SELECT articles.* FROM articles WHERE ( title = ? )", + }, + { + "SELECT articles.* FROM articles WHERE ( title = ? ) AND ( author = ? )", + "SELECT articles.* FROM articles WHERE ( title = ? ) AND ( author = ? )", + }, + { + "SELECT articles.* FROM articles WHERE ( title = :title )", + "SELECT articles.* FROM articles WHERE ( title = :title )", + }, + { + "SELECT articles.* FROM articles WHERE ( title = @title )", + "SELECT articles.* FROM articles WHERE ( title = @title )", + }, + { + "SELECT date(created_at) as ordered_date, sum(price) as total_price FROM orders GROUP BY date(created_at) HAVING sum(price) > 100", + "SELECT date ( created_at ), sum ( price ) FROM orders GROUP BY date ( created_at ) HAVING sum ( price ) > ?", + }, + { + "SELECT * FROM articles WHERE id > 10 ORDER BY id asc LIMIT 20", + "SELECT * FROM articles WHERE id > ? ORDER BY id asc LIMIT ?", + }, + { + "SELECT clients.* FROM clients INNER JOIN posts ON posts.author_id = author.id AND posts.published = 't'", + "SELECT clients.* FROM clients INNER JOIN posts ON posts.author_id = author.id AND posts.published = ?", + }, + { + "SELECT articles.* FROM articles WHERE articles.id IN (1, 3, 5)", + "SELECT articles.* FROM articles WHERE articles.id IN ( ? 
)", + }, + { + "SELECT * FROM clients WHERE (clients.first_name = 'Andy') LIMIT 1 BEGIN INSERT INTO clients (created_at, first_name, locked, orders_count, updated_at) VALUES ('2011-08-30 05:22:57', 'Andy', 1, NULL, '2011-08-30 05:22:57') COMMIT", + "SELECT * FROM clients WHERE ( clients.first_name = ? ) LIMIT ? BEGIN INSERT INTO clients ( created_at, first_name, locked, orders_count, updated_at ) VALUES ( ? ) COMMIT", + }, + { + "SELECT * FROM clients WHERE (clients.first_name = 'Andy') LIMIT 15, 25 BEGIN INSERT INTO clients (created_at, first_name, locked, orders_count, updated_at) VALUES ('2011-08-30 05:22:57', 'Andy', 1, NULL, '2011-08-30 05:22:57') COMMIT", + "SELECT * FROM clients WHERE ( clients.first_name = ? ) LIMIT ? BEGIN INSERT INTO clients ( created_at, first_name, locked, orders_count, updated_at ) VALUES ( ? ) COMMIT", + }, + { + "SAVEPOINT \"s139956586256192_x1\"", + "SAVEPOINT ?", + }, + { + "INSERT INTO user (id, username) VALUES ('Fred','Smith'), ('John','Smith'), ('Michael','Smith'), ('Robert','Smith');", + "INSERT INTO user ( id, username ) VALUES ( ? )", + }, + { + "CREATE KEYSPACE Excelsior WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3};", + "CREATE KEYSPACE Excelsior WITH replication = ?", + }, + { + `SELECT "webcore_page"."id" FROM "webcore_page" WHERE "webcore_page"."slug" = %s ORDER BY "webcore_page"."path" ASC LIMIT 1`, + "SELECT webcore_page . id FROM webcore_page WHERE webcore_page . slug = ? ORDER BY webcore_page . path ASC LIMIT ?", + }, + { + "SELECT server_table.host AS host_id FROM table#.host_tags as server_table WHERE server_table.host_id = 50", + "SELECT server_table.host FROM table#.host_tags WHERE server_table.host_id = ?", + }, + { + `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`, + "INSERT INTO delayed_jobs ( attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at ) VALUES ( ? )", + }, + { + "SELECT name, pretty_print(address) FROM people;", + "SELECT name, pretty_print ( address ) FROM people", + }, + { + "* SELECT * FROM fake_data(1, 2, 3);", + "* SELECT * FROM fake_data ( ? )", + }, + { + "CREATE FUNCTION add(integer, integer) RETURNS integer\n AS 'select $1 + $2;'\n LANGUAGE SQL\n IMMUTABLE\n RETURNS NULL ON NULL INPUT;", + "CREATE FUNCTION add ( integer, integer ) RETURNS integer LANGUAGE SQL IMMUTABLE RETURNS ? ON ? INPUT", + }, + { + "SELECT * FROM public.table ( array [ ROW ( array [ 'magic', 'foo',", + "SELECT * FROM public.table ( array [ ROW ( array [ ?", + }, + { + "SELECT pg_try_advisory_lock (123) AS t46eef3f025cc27feb31ca5a2d668a09a", + "SELECT pg_try_advisory_lock ( ? )", + }, + { + "INSERT INTO `qual-aa`.issues (alert0 , alert1) VALUES (NULL, NULL)", + "INSERT INTO qual-aa . issues ( alert0, alert1 ) VALUES ( ? )", + }, + { + "INSERT INTO user (id, email, name) VALUES (null, ?, ?)", + "INSERT INTO user ( id, email, name ) VALUES ( ? 
)", + }, + { + "select * from users where id = 214325346 # This comment continues to the end of line", + "select * from users where id = ?", + }, + { + "select * from users where id = 214325346 -- This comment continues to the end of line", + "select * from users where id = ?", + }, + { + "SELECT * FROM /* this is an in-line comment */ users;", + "SELECT * FROM users", + }, + { + "SELECT /*! STRAIGHT_JOIN */ col1 FROM table1", + "SELECT col1 FROM table1", + }, + { + `DELETE FROM t1 + WHERE s11 > ANY + (SELECT COUNT(*) /* no hint */ FROM t2 + WHERE NOT EXISTS + (SELECT * FROM t3 + WHERE ROW(5*t2.s1,77)= + (SELECT 50,11*s1 FROM t4 UNION SELECT 50,77 FROM + (SELECT * FROM t5) AS t5)));`, + "DELETE FROM t1 WHERE s11 > ANY ( SELECT COUNT ( * ) FROM t2 WHERE NOT EXISTS ( SELECT * FROM t3 WHERE ROW ( ? * t2.s1, ? ) = ( SELECT ? * s1 FROM t4 UNION SELECT ? FROM ( SELECT * FROM t5 ) ) ) )", + }, + { + "SET @g = 'POLYGON((0 0,10 0,10 10,0 10,0 0),(5 5,7 5,7 7,5 7, 5 5))';", + "SET @g = ?", + }, + { + `SELECT daily_values.*, + LEAST((5040000 - @runtot), value) AS value, + ` + "(@runtot := @runtot + daily_values.value) AS total FROM (SELECT @runtot:=0) AS n, `daily_values` WHERE `daily_values`.`subject_id` = 12345 AND `daily_values`.`subject_type` = 'Skippity' AND (daily_values.date BETWEEN '2018-05-09' AND '2018-06-19') HAVING value >= 0 ORDER BY date", + `SELECT daily_values.*, LEAST ( ( ? - @runtot ), value ), ( @runtot := @runtot + daily_values.value ) FROM ( SELECT @runtot := ? ), daily_values WHERE daily_values . subject_id = ? AND daily_values . subject_type = ? AND ( daily_values.date BETWEEN ? AND ? ) HAVING value >= ? ORDER BY date`, + }, + { + ` SELECT + t1.userid, + t1.fullname, + t1.firm_id, + t2.firmname, + t1.email, + t1.location, + t1.state, + t1.phone, + t1.url, + DATE_FORMAT( t1.lastmod, "%m/%d/%Y %h:%i:%s" ) AS lastmod, + t1.lastmod AS lastmod_raw, + t1.user_status, + t1.pw_expire, + DATE_FORMAT( t1.pw_expire, "%m/%d/%Y" ) AS pw_expire_date, + t1.addr1, + t1.addr2, + t1.zipcode, + t1.office_id, + t1.default_group, + t3.firm_status, + t1.title + FROM + userdata AS t1 + LEFT JOIN lawfirm_names AS t2 ON t1.firm_id = t2.firm_id + LEFT JOIN lawfirms AS t3 ON t1.firm_id = t3.firm_id + WHERE + t1.userid = 'jstein' + + `, + `SELECT t1.userid, t1.fullname, t1.firm_id, t2.firmname, t1.email, t1.location, t1.state, t1.phone, t1.url, DATE_FORMAT ( t1.lastmod, %m/%d/%Y %h:%i:%s ), t1.lastmod, t1.user_status, t1.pw_expire, DATE_FORMAT ( t1.pw_expire, %m/%d/%Y ), t1.addr1, t1.addr2, t1.zipcode, t1.office_id, t1.default_group, t3.firm_status, t1.title FROM userdata LEFT JOIN lawfirm_names ON t1.firm_id = t2.firm_id LEFT JOIN lawfirms ON t1.firm_id = t3.firm_id WHERE t1.userid = ?`, + }, + { + `SELECT [b].[BlogId], [b].[Name] +FROM [Blogs] AS [b] +ORDER BY [b].[Name]`, + `SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . 
[ Name ]`, + }, + { + `SELECT * FROM users WHERE firstname=''`, + `SELECT * FROM users WHERE firstname = ?`, + }, + { + `SELECT * FROM users WHERE firstname=' '`, + `SELECT * FROM users WHERE firstname = ?`, + }, + { + `SELECT * FROM users WHERE firstname=""`, + `SELECT * FROM users WHERE firstname = ""`, + }, + { + `SELECT * FROM users WHERE lastname=" "`, + `SELECT * FROM users WHERE lastname = ""`, + }, + { + `SELECT * FROM users WHERE lastname=" "`, + `SELECT * FROM users WHERE lastname = ""`, + }, + { + `SELECT [b].[BlogId], [b].[Name] +FROM [Blogs] AS [b +ORDER BY [b].[Name]`, + `Non-parsable SQL query`, + }, + } + + for i, c := range cases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + s := SQLSpan(c.query) + NewObfuscator(nil).Obfuscate(s) + assert.Equal(t, c.expected, s.Resource) + }) + } +} + +func TestMultipleProcess(t *testing.T) { + assert := assert.New(t) + + testCases := []struct { + query string + expected string + }{ + { + "SELECT clients.* FROM clients INNER JOIN posts ON posts.author_id = author.id AND posts.published = 't'", + "SELECT clients.* FROM clients INNER JOIN posts ON posts.author_id = author.id AND posts.published = ?", + }, + { + "SELECT articles.* FROM articles WHERE articles.id IN (1, 3, 5)", + "SELECT articles.* FROM articles WHERE articles.id IN ( ? )", + }, + } + + // The consumer is the same between executions + obf := newSQLObfuscator() + + for _, tc := range testCases { + output, err := obf.obfuscate(tc.query) + assert.Nil(err) + assert.Equal(tc.expected, output) + } +} + +func TestConsumerError(t *testing.T) { + assert := assert.New(t) + + // Malformed SQL is not accepted and the outer component knows + // what to do with malformed SQL + input := "SELECT * FROM users WHERE users.id = '1 AND users.name = 'dog'" + obf := newSQLObfuscator() + + output, err := obf.obfuscate(input) + assert.NotNil(err) + assert.Equal("", output) +} + +// Benchmark the Tokenizer using a SQL statement +func BenchmarkTokenizer(b *testing.B) { + benchmarks := []struct { + name string + query string + }{ + {"Escaping", `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`}, + {"Grouping", `INSERT INTO delayed_jobs (created_at, failed_at, handler) VALUES (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL)`}, + } + obf := newSQLObfuscator() + + for _, bm := range benchmarks { + b.Run(bm.name+"/"+strconv.Itoa(len(bm.query)), func(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _, _ = obf.obfuscate(bm.query) + } + }) + } +} + +func CassSpan(query string) *pb.Span { + return &pb.Span{ + Resource: query, + Type: "cassandra", + Meta: map[string]string{ + "query": query, + }, + } +} + +func TestCassQuantizer(t *testing.T) { + assert := assert.New(t) + + queryToExpected := []struct{ in, expected string }{ + // List compacted and replaced + { + "select key, status, modified from org_check_run where org_id = %s and check in (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", + "select key, status, modified from org_check_run where org_id 
= ? and check in ( ? )", + }, + // Some whitespace-y things + { + "select key, status, modified from org_check_run where org_id = %s and check in (%s, %s, %s)", + "select key, status, modified from org_check_run where org_id = ? and check in ( ? )", + }, + { + "select key, status, modified from org_check_run where org_id = %s and check in (%s , %s , %s )", + "select key, status, modified from org_check_run where org_id = ? and check in ( ? )", + }, + // %s replaced with ? as in sql quantize + { + "select key, status, modified from org_check_run where org_id = %s and check = %s", + "select key, status, modified from org_check_run where org_id = ? and check = ?", + }, + { + "select key, status, modified from org_check_run where org_id = %s and check = %s", + "select key, status, modified from org_check_run where org_id = ? and check = ?", + }, + { + "SELECT timestamp, processes FROM process_snapshot.minutely WHERE org_id = ? AND host = ? AND timestamp >= ? AND timestamp <= ?", + "SELECT timestamp, processes FROM process_snapshot.minutely WHERE org_id = ? AND host = ? AND timestamp >= ? AND timestamp <= ?", + }, + } + + for _, testCase := range queryToExpected { + s := CassSpan(testCase.in) + NewObfuscator(nil).Obfuscate(s) + assert.Equal(testCase.expected, s.Resource) + } +} diff --git a/pkg/trace/obfuscate/sql_tokenizer.go b/pkg/trace/obfuscate/sql_tokenizer.go new file mode 100644 index 0000000000000..757d1f56b290f --- /dev/null +++ b/pkg/trace/obfuscate/sql_tokenizer.go @@ -0,0 +1,501 @@ +package obfuscate + +import ( + "bytes" + "strings" + "unicode" +) + +// tokenizer.go implemenents a lexer-like iterator that tokenizes SQL and CQL +// strings, so that an external component can filter or alter each token of the +// string. This implementation can't be used as a real SQL lexer (so a parser +// cannot build the AST) because many rules are ignored to make the tokenizer +// simpler. +// This implementation was inspired by https://github.com/youtube/vitess sql parser +// TODO: add the license to the NOTICE file + +// list of available tokens; this list has been reduced because we don't +// need a full-fledged tokenizer to implement a Lexer +const ( + EOFChar = 0x100 + LexError = 57346 + ID = 57347 + Limit = 57348 + Null = 57349 + String = 57350 + Number = 57351 + BooleanLiteral = 57352 + ValueArg = 57353 + ListArg = 57354 + Comment = 57355 + Variable = 57356 + Savepoint = 57357 + PreparedStatement = 57358 + EscapeSequence = 57359 + NullSafeEqual = 57360 + LE = 57361 + GE = 57362 + NE = 57363 + As = 57365 + + // Filtered specifies that the given token has been discarded by one of the + // token filters. + Filtered = 57364 + + // FilteredComma specifies that the token is a comma and was discarded by one + // of the filters. + FilteredComma = 57366 + + // FilteredBracketedIdentifier specifies that we are currently discarding + // a bracketed identifier (MSSQL). + // See issue https://github.com/DataDog/datadog-trace-agent/issues/475. + FilteredBracketedIdentifier = 57367 +) + +// Tokenizer is the struct used to generate SQL +// tokens for the parser. +type Tokenizer struct { + InStream *strings.Reader + Position int + lastChar uint16 +} + +// NewStringTokenizer creates a new Tokenizer for the +// sql string. 
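+// A minimal usage sketch: scan tokens until EOFChar is returned, e.g.
+//
+//	tkn := NewStringTokenizer("SELECT * FROM users WHERE id = 42")
+//	for tok, buf := tkn.Scan(); tok != EOFChar; tok, buf = tkn.Scan() {
+//		_ = buf // token bytes; tok is the token type (ID, Number, String, ...)
+//	}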
+func NewStringTokenizer(sql string) *Tokenizer { + return &Tokenizer{InStream: strings.NewReader(sql)} +} + +// Reset the underlying buffer and positions +func (tkn *Tokenizer) Reset(in string) { + tkn.InStream.Reset(in) + tkn.Position = 0 + tkn.lastChar = 0 +} + +// keywords used to recognize string tokens +var keywords = map[string]int{ + "NULL": Null, + "TRUE": BooleanLiteral, + "FALSE": BooleanLiteral, + "SAVEPOINT": Savepoint, + "LIMIT": Limit, + "AS": As, +} + +// Scan scans the tokenizer for the next token and returns +// the token type and the token buffer. +// TODO[manu]: the current implementation returns a new Buffer +// for each Scan(). An improvement to reduce the overhead of +// the Scan() is to return slices instead of buffers. +func (tkn *Tokenizer) Scan() (int, []byte) { + if tkn.lastChar == 0 { + tkn.next() + } + tkn.skipBlank() + + switch ch := tkn.lastChar; { + case isLeadingLetter(ch): + return tkn.scanIdentifier() + case isDigit(ch): + return tkn.scanNumber(false) + default: + tkn.next() + switch ch { + case EOFChar: + return EOFChar, nil + case ':': + if tkn.lastChar != '=' { + return tkn.scanBindVar() + } + fallthrough + case '=', ',', ';', '(', ')', '+', '*', '&', '|', '^', '~', '[', ']', '?': + return int(ch), []byte{byte(ch)} + case '.': + if isDigit(tkn.lastChar) { + return tkn.scanNumber(true) + } + return int(ch), []byte{byte(ch)} + case '/': + switch tkn.lastChar { + case '/': + tkn.next() + return tkn.scanCommentType1("//") + case '*': + tkn.next() + return tkn.scanCommentType2() + default: + return int(ch), []byte{byte(ch)} + } + case '-': + if tkn.lastChar == '-' { + tkn.next() + return tkn.scanCommentType1("--") + } + return int(ch), []byte{byte(ch)} + case '#': + tkn.next() + return tkn.scanCommentType1("#") + case '<': + switch tkn.lastChar { + case '>': + tkn.next() + return NE, []byte("<>") + case '=': + tkn.next() + switch tkn.lastChar { + case '>': + tkn.next() + return NullSafeEqual, []byte("<=>") + default: + return LE, []byte("<=") + } + default: + return int(ch), []byte{byte(ch)} + } + case '>': + if tkn.lastChar == '=' { + tkn.next() + return GE, []byte(">=") + } + return int(ch), []byte{byte(ch)} + case '!': + if tkn.lastChar == '=' { + tkn.next() + return NE, []byte("!=") + } + return LexError, []byte("!") + case '\'': + return tkn.scanString(ch, String) + case '"': + return tkn.scanString(ch, ID) + case '`': + return tkn.scanLiteralIdentifier('`') + case '%': + if tkn.lastChar == '(' { + return tkn.scanVariableIdentifier('%') + } + return tkn.scanFormatParameter('%') + case '$': + return tkn.scanPreparedStatement('$') + case '{': + return tkn.scanEscapeSequence('{') + default: + return LexError, []byte{byte(ch)} + } + } +} + +func (tkn *Tokenizer) skipBlank() { + ch := tkn.lastChar + for ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' { + tkn.next() + ch = tkn.lastChar + } +} + +func (tkn *Tokenizer) scanIdentifier() (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + + for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' 
|| tkn.lastChar == '*' { + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + } + upper := bytes.ToUpper(buffer.Bytes()) + if keywordID, found := keywords[string(upper)]; found { + return keywordID, upper + } + return ID, buffer.Bytes() +} + +func (tkn *Tokenizer) scanLiteralIdentifier(quote rune) (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteByte(byte(tkn.lastChar)) + if !isLetter(tkn.lastChar) { + return LexError, buffer.Bytes() + } + for tkn.next(); skipNonLiteralIdentifier(tkn.lastChar); tkn.next() { + buffer.WriteByte(byte(tkn.lastChar)) + } + // literals identifier are enclosed between quotes + if tkn.lastChar != uint16(quote) { + return LexError, buffer.Bytes() + } + tkn.next() + return ID, buffer.Bytes() +} + +func (tkn *Tokenizer) scanVariableIdentifier(prefix rune) (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteRune(prefix) + buffer.WriteByte(byte(tkn.lastChar)) + + // expects that the variable is enclosed between '(' and ')' parenthesis + if tkn.lastChar != '(' { + return LexError, buffer.Bytes() + } + for tkn.next(); tkn.lastChar != ')' && tkn.lastChar != EOFChar; tkn.next() { + buffer.WriteByte(byte(tkn.lastChar)) + } + + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + buffer.WriteByte(byte(tkn.lastChar)) + if !isLetter(tkn.lastChar) { + return LexError, buffer.Bytes() + } + tkn.next() + return Variable, buffer.Bytes() +} + +func (tkn *Tokenizer) scanFormatParameter(prefix rune) (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteRune(prefix) + buffer.WriteByte(byte(tkn.lastChar)) + + // a format parameter is like '%s' so it should be a letter otherwise + // we're having something different + if !isLetter(tkn.lastChar) { + return LexError, buffer.Bytes() + } + + tkn.next() + return Variable, buffer.Bytes() +} + +func (tkn *Tokenizer) scanPreparedStatement(prefix rune) (int, []byte) { + buffer := &bytes.Buffer{} + + // a prepared statement expect a digit identifier like $1 + if !isDigit(tkn.lastChar) { + return LexError, buffer.Bytes() + } + + // read numbers and return an error if any + token, buff := tkn.scanNumber(false) + if token == LexError { + return LexError, buffer.Bytes() + } + + buffer.WriteRune(prefix) + buffer.Write(buff) + return PreparedStatement, buffer.Bytes() +} + +func (tkn *Tokenizer) scanEscapeSequence(braces rune) (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteByte(byte(braces)) + + for tkn.lastChar != '}' && tkn.lastChar != EOFChar { + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + } + + // we've reached the end of the string without finding + // the closing curly braces + if tkn.lastChar == EOFChar { + return LexError, buffer.Bytes() + } + + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + return EscapeSequence, buffer.Bytes() +} + +func (tkn *Tokenizer) scanBindVar() (int, []byte) { + buffer := bytes.NewBufferString(":") + token := ValueArg + if tkn.lastChar == ':' { + token = ListArg + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + } + if !isLetter(tkn.lastChar) { + return LexError, buffer.Bytes() + } + for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' 
{ + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() + } + return token, buffer.Bytes() +} + +func (tkn *Tokenizer) scanMantissa(base int, buffer *bytes.Buffer) { + for digitVal(tkn.lastChar) < base { + tkn.consumeNext(buffer) + } +} + +func (tkn *Tokenizer) scanNumber(seenDecimalPoint bool) (int, []byte) { + buffer := &bytes.Buffer{} + if seenDecimalPoint { + buffer.WriteByte('.') + tkn.scanMantissa(10, buffer) + goto exponent + } + + if tkn.lastChar == '0' { + // int or float + tkn.consumeNext(buffer) + if tkn.lastChar == 'x' || tkn.lastChar == 'X' { + // hexadecimal int + tkn.consumeNext(buffer) + tkn.scanMantissa(16, buffer) + } else { + // octal int or float + seenDecimalDigit := false + tkn.scanMantissa(8, buffer) + if tkn.lastChar == '8' || tkn.lastChar == '9' { + // illegal octal int or float + seenDecimalDigit = true + tkn.scanMantissa(10, buffer) + } + if tkn.lastChar == '.' || tkn.lastChar == 'e' || tkn.lastChar == 'E' { + goto fraction + } + // octal int + if seenDecimalDigit { + return LexError, buffer.Bytes() + } + } + goto exit + } + + // decimal int or float + tkn.scanMantissa(10, buffer) + +fraction: + if tkn.lastChar == '.' { + tkn.consumeNext(buffer) + tkn.scanMantissa(10, buffer) + } + +exponent: + if tkn.lastChar == 'e' || tkn.lastChar == 'E' { + tkn.consumeNext(buffer) + if tkn.lastChar == '+' || tkn.lastChar == '-' { + tkn.consumeNext(buffer) + } + tkn.scanMantissa(10, buffer) + } + +exit: + return Number, buffer.Bytes() +} + +func (tkn *Tokenizer) scanString(delim uint16, typ int) (int, []byte) { + buffer := &bytes.Buffer{} + for { + ch := tkn.lastChar + tkn.next() + if ch == delim { + if tkn.lastChar == delim { + tkn.next() + } else { + break + } + } else if ch == '\\' { + if tkn.lastChar == EOFChar { + return LexError, buffer.Bytes() + } + + ch = tkn.lastChar + tkn.next() + } + if ch == EOFChar { + return LexError, buffer.Bytes() + } + buffer.WriteByte(byte(ch)) + } + buf := buffer.Bytes() + if typ == ID && len(buf) == 0 || bytes.IndexFunc(buf, func(r rune) bool { return !unicode.IsSpace(r) }) == -1 { + // This string is an empty or white-space only identifier. + // We should keep the start and end delimiters in order to + // avoid creating invalid queries. + // See: https://github.com/DataDog/datadog-trace-agent/issues/316 + return typ, []byte{byte(delim), byte(delim)} + } + return typ, buf +} + +func (tkn *Tokenizer) scanCommentType1(prefix string) (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteString(prefix) + for tkn.lastChar != EOFChar { + if tkn.lastChar == '\n' { + tkn.consumeNext(buffer) + break + } + tkn.consumeNext(buffer) + } + return Comment, buffer.Bytes() +} + +func (tkn *Tokenizer) scanCommentType2() (int, []byte) { + buffer := &bytes.Buffer{} + buffer.WriteString("/*") + for { + if tkn.lastChar == '*' { + tkn.consumeNext(buffer) + if tkn.lastChar == '/' { + tkn.consumeNext(buffer) + break + } + continue + } + if tkn.lastChar == EOFChar { + return LexError, buffer.Bytes() + } + tkn.consumeNext(buffer) + } + return Comment, buffer.Bytes() +} + +func (tkn *Tokenizer) consumeNext(buffer *bytes.Buffer) { + if tkn.lastChar == EOFChar { + // This should never happen. + panic("unexpected EOF") + } + buffer.WriteByte(byte(tkn.lastChar)) + tkn.next() +} + +func (tkn *Tokenizer) next() { + if ch, err := tkn.InStream.ReadByte(); err != nil { + // Only EOF is possible. + tkn.lastChar = EOFChar + } else { + tkn.lastChar = uint16(ch) + } + tkn.Position++ +} + +func skipNonLiteralIdentifier(ch uint16) bool { + return isLetter(ch) || isDigit(ch) || '.' 
== ch || '-' == ch +} + +func isLeadingLetter(ch uint16) bool { + return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch == '@' +} + +func isLetter(ch uint16) bool { + return isLeadingLetter(ch) || ch == '#' +} + +func digitVal(ch uint16) int { + switch { + case '0' <= ch && ch <= '9': + return int(ch) - '0' + case 'a' <= ch && ch <= 'f': + return int(ch) - 'a' + 10 + case 'A' <= ch && ch <= 'F': + return int(ch) - 'A' + 10 + } + return 16 // larger than any legal digit val +} + +func isDigit(ch uint16) bool { + return '0' <= ch && ch <= '9' +} diff --git a/pkg/trace/obfuscate/testdata/json_tests.xml b/pkg/trace/obfuscate/testdata/json_tests.xml new file mode 100644 index 0000000000000..98ff5364f8a48 --- /dev/null +++ b/pkg/trace/obfuscate/testdata/json_tests.xml @@ -0,0 +1,490 @@ + + + + + + + elasticsearch.body + + { "query": { "multi_match" : { "query" : "guide", "fields" : ["_all", { "key": "value", "other": ["1", "2", {"k": "v"}] }, "2"] } } } + { "query": { "multi_match": { "query": "?", "fields" : ["?", { "key": "?", "other": ["?", "?", {"k": "?"}] }, "?"] } } } + + + + + + elasticsearch.body + + " ], + "post_tags": [ "" ], + "index": 1 + } +} + ]]> + + + + + + + elasticsearch.body + + other + + { "query": { "multi_match" : { "query" : "guide", "fields" : ["_all", { "key": "value", "other": ["1", "2", {"k": "v"}] }, "2"] } } } + { "query": { "multi_match": { "query": "?", "fields" : ["?", { "key": "?", "other": ["1", "2", {"k": "v"}] }, "?"] } } } + + + + + + elasticsearch.body + fields + {"fields" : ["_all", { "key": "value", "other": ["1", "2", {"k": "v"}] }, "2"]} + {"fields" : ["_all", { "key": "value", "other": ["1", "2", {"k": "v"}] }, "2"]} + + + + + + elasticsearch.body + k + {"fields" : ["_all", { "key": "value", "other": ["1", "2", {"k": "v"}] }, "2"]} + {"fields" : ["?", { "key": "?", "other": ["?", "?", {"k": "v"}] }, "?"]} + + + + + + elasticsearch.body + C + {"fields" : [{"A": 1, "B": {"C": 3}}, "2"]} + {"fields" : [{"A": "?", "B": {"C": 3}}, "?"]} + + + + + + elasticsearch.body + +{ + "query": { + "match" : { + "title" : "in action" + } + }, + "size": 2, + "from": 0, + "_source": [ "title", "summary", "publish_date" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + +{ + "query": { + "match" : { + "title" : "?" + } + }, + "size": "?", + "from": "?", + "_source": [ "?", "?", "?" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + + + + + + elasticsearch.body + _source + +{ + "query": { + "match" : { + "title" : "in action" + } + }, + "size": 2, + "from": 0, + "_source": [ "title", "summary", "publish_date" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + +{ + "query": { + "match" : { + "title" : "?" + } + }, + "size": "?", + "from": "?", + "_source": [ "title", "summary", "publish_date" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + + + + + + elasticsearch.body + query + +{ + "query": { + "match" : { + "title" : "in action" + } + }, + "size": 2, + "from": 0, + "_source": [ "title", "summary", "publish_date" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + +{ + "query": { + "match" : { + "title" : "in action" + } + }, + "size": "?", + "from": "?", + "_source": [ "?", "?", "?" 
], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + + + + + + elasticsearch.body + match + +{ + "query": { + "match" : { + "title" : "in action" + } + }, + "size": 2, + "from": 0, + "_source": [ "title", "summary", "publish_date" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + +{ + "query": { + "match" : { + "title" : "in action" + } + }, + "size": "?", + "from": "?", + "_source": [ "?", "?", "?" ], + "highlight": { + "fields" : { + "title" : {} + } + } +} + + + + + + + elasticsearch.body + + hits + + + + + + + + + elasticsearch.body + + _index + title + + +{ + "hits": { + "total": 2, + "max_score": 0.9105287, + "hits": [ + { + "_index": "bookdb_index", + "_type": "book", + "_id": "3", + "_score": 0.9105287, + "_source": { + "summary": "build scalable search applications using Elasticsearch without having to do complex low-level programming or understand advanced data science algorithms", + "title": "Elasticsearch in Action", + "publish_date": "2015-12-03" + }, + "highlight": { + "title": [ + "Elasticsearch Action" + ] + } + }, + { + "_index": "bookdb_index", + "_type": "book", + "_id": "4", + "_score": 0.9105287, + "_source": { + "summary": "Comprehensive guide to implementing a scalable search engine using Apache Solr", + "title": "Solr in Action", + "publish_date": "2014-04-05" + }, + "highlight": { + "title": [ + "Solr Action" + ] + } + } + ] + } +} + + +{ + "hits": { + "total": "?", + "max_score": "?", + "hits": [ + { + "_index": "bookdb_index", + "_type": "?", + "_id": "?", + "_score": "?", + "_source": { + "summary": "?", + "title": "Elasticsearch in Action", + "publish_date": "?" + }, + "highlight": { + "title": [ + "Elasticsearch Action" + ] + } + }, + { + "_index": "bookdb_index", + "_type": "?", + "_id": "?", + "_score": "?", + "_source": { + "summary": "?", + "title": "Solr in Action", + "publish_date": "?" + }, + "highlight": { + "title": [ + "Solr Action" + ] + } + } + ] + } +} + + + + elasticsearch.body + + _source + + " ], "post_tags": [ "" ], + "fields": { + "body": { "number_of_fragments": 1, "fragment_size": 20 }, + "title": {} + } + }, + "size": 20, + "from": 100, + "_source": [ "title", "id" ], + "sort": [ { "_id": { "order": "desc" } } ] +} +]]> + + + + elasticsearch.body + true + {"index":{"_index":"traces.v2.2018.06.29.11","_routing":"2:-1851516970739438017","_type":"trace"}} {"trace_id":-1851516970739438017,"span":[{"service":"master-db","name":"postgres.query","resource":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? )","duration":532865,"error":0,"meta":{"db.application":"brokerstate","db.name":"dogdatastaging","db.user":"None","out.host":"''","out.port":"6432","sql.query":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? 
)","system.pid":"23463"},"metrics":{"_sample_rate":0.08579267671651072,"_sampling_priority_v1":1,"_top_level":1,"db.rowcount":1},"type":"sql","resource_hash":"633ad3800be7ec31","start":"2018-06-29T11:30:49.021115904Z","end":"2018-06-29T11:30:49.021648769Z","trace_id":-1851516970739438017,"span_id":3635861121986229119,"parent_id":0,"is_root":true}],"org_id":2,"host_id":2832410,"start":"2018-06-29T11:30:49.021115904Z","end":"2018-06-29T11:30:49.021648769Z","env":"staging","host_groups":["availability-zone:us-east-1a","env:staging"]} {"index":{"_index":"traces.v2.2018.06.29.11","_routing":"2:-7171575148150503216","_type":"trace"}} {"trace_id":-7171575148150503216,"span":[{"service":"master-db","name":"postgres.query","resource":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? )","duration":541925,"error":0,"meta":{"db.application":"brokerstate","db.name":"dogdatastaging","db.user":"None","out.host":"''","out.port":"6432","sql.query":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? )","system.pid":"23463"},"metrics":{"_sample_rate":0.02845090898763012,"_sampling_priority_v1":1,"_top_level":1,"db.rowcount":1},"type":"sql","resource_hash":"633ad3800be7ec31","start":"2018-06-29T11:30:49.870599936Z","end":"2018-06-29T11:30:49.871141861Z","trace_id":-7171575148150503216,"span_id":-4982373041719473893,"parent_id":0,"is_root":true}],"org_id":2,"host_id":2832410,"start":"2018-06-29T11:30:49.870599936Z","end":"2018-06-29T11:30:49.871141861Z","env":"staging","host_groups":["availability-zone:us-east-1a","env:staging"]} {"index":{"_index":"traces.v2.2018.06.29.11","_routing":"2:3438931145341397782","_type":"trace"}} {"trace_id":3438931145341397782,"span":[{"service":"master-db","name":"postgres.query","resource":"begin","duration":1988172,"error":0,"meta":{"db.application":"brokerstate","db.name":"dogdatastaging","db.user":"None","out.host":"''","out.port":"6432","sql.query":"begin","system.pid":"23463"},"metrics":{"_sample_rate":1,"_sampling_priority_v1":1,"_top_level":1,"db.rowcount":-1},"type":"sql","resource_hash":"fc747ae36f14c50d","start":"2018-06-29T11:30:48.886354944Z","end":"2018-06-29T11:30:48.888343116Z","trace_id":3438931145341397782,"span_id":8432748882772113994,"parent_id":0,"is_root":true}],"org_id":2,"host_id":2832410,"start":"2018-06-29T11:30:48.886354944Z","end":"2018-06-29T11:30:48.888343116Z","env":"staging","host_groups":["availability-zone:us-east-1a","env:staging"]} {"index":{"_index":"traces.v2.2018.06.29.11","_routing":"2:-2942210836778233450","_type":"trace"}} {"trace_id":-2942210836778233450,"span":[{"service":"master-db","name":"postgres.query","resource":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? )","duration":538825,"error":0,"meta":{"db.application":"brokerstate","db.name":"dogdatastaging","db.user":"None","out.host":"''","out.port":"6432","sql.query":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? 
)","system.pid":"23463"},"metrics":{"_sample_rate":0.09493583930982655,"_sampling_priority_v1":1,"_top_level":1,"db.rowcount":1},"type":"sql","resource_hash":"633ad3800be7ec31","start":"2018-06-29T11:30:48.995932928Z","end":"2018-06-29T11:30:48.996471753Z","trace_id":-2942210836778233450,"span_id":1801908560308090622,"parent_id":0,"is_root":true}],"org_id":2,"host_id":2832410,"start":"2018-06-29T11:30:48.995932928Z","end":"2018-06-29T11:30:48.996471753Z","env":"staging","host_groups":["availability-zone:us-east-1a","env:staging"]} {"index":{"_index":"traces.v2.2018.06.29.11","_routing":"2:1154462040005386081","_type":"trace"}} {"trace_id":1154462040005386081,"span":[{"service":"master-db","name":"postgres.query","resource":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? )","duration":16173124,"error":0,"meta":{"db.application":"brokerstate","db.name":"dogdatastaging","db.user":"None","out.host":"''","out.port":"6432","sql.query":"INSERT INTO kafka_broker_state ( broker_id, topic, partition, latest_offset, kafka_version ) VALUES ( ? )","system.pid":"23463"},"metrics":{"_sample_rate":0.03305929657743924,"_sampling_priority_v1":1,"_top_level":1,"db.rowcount":1},"type":"sql","resource_hash":"633ad3800be7ec31","start":"2018-06-29T11:30:49.730038784Z","end":"2018-... + {"index":{"_index":"?","_routing":"?","_type":"?"}} {"trace_id":"?","span":[{"service":"?","name":"?","resource":"?","duration":"?","error":"?","meta":{"db.application":"?","db.name":"?","db.user":"?","out.host":"?","out.port":"?","sql.query":"?","system.pid":"?"},"metrics":{"_sample_rate":"?","_sampling_priority_v1":"?","_top_level":"?","db.rowcount":"?"},"type":"?","resource_hash":"?","start":"?","end":"?","trace_id":"?","span_id":"?","parent_id":"?","is_root":"?"}],"org_id":"?","host_id":"?","start":"?","end":"?","env":"?","host_groups":["?","?"]} {"index":{"_index":"?","_routing":"?","_type":"?"}} {"trace_id":"?","span":[{"service":"?","name":"?","resource":"?","duration":"?","error":"?","meta":{"db.application":"?","db.name":"?","db.user":"?","out.host":"?","out.port":"?","sql.query":"?","system.pid":"?"},"metrics":{"_sample_rate":"?","_sampling_priority_v1":"?","_top_level":"?","db.rowcount":"?"},"type":"?","resource_hash":"?","start":"?","end":"?","trace_id":"?","span_id":"?","parent_id":"?","is_root":"?"}],"org_id":"?","host_id":"?","start":"?","end":"?","env":"?","host_groups":["?","?"]} {"index":{"_index":"?","_routing":"?","_type":"?"}} {"trace_id":"?","span":[{"service":"?","name":"?","resource":"?","duration":"?","error":"?","meta":{"db.application":"?","db.name":"?","db.user":"?","out.host":"?","out.port":"?","sql.query":"?","system.pid":"?"},"metrics":{"_sample_rate":"?","_sampling_priority_v1":"?","_top_level":"?","db.rowcount":"?"},"type":"?","resource_hash":"?","start":"?","end":"?","trace_id":"?","span_id":"?","parent_id":"?","is_root":"?"}],"org_id":"?","host_id":"?","start":"?","end":"?","env":"?","host_groups":["?","?"]} {"index":{"_index":"?","_routing":"?","_type":"?"}} 
{"trace_id":"?","span":[{"service":"?","name":"?","resource":"?","duration":"?","error":"?","meta":{"db.application":"?","db.name":"?","db.user":"?","out.host":"?","out.port":"?","sql.query":"?","system.pid":"?"},"metrics":{"_sample_rate":"?","_sampling_priority_v1":"?","_top_level":"?","db.rowcount":"?"},"type":"?","resource_hash":"?","start":"?","end":"?","trace_id":"?","span_id":"?","parent_id":"?","is_root":"?"}],"org_id":"?","host_id":"?","start":"?","end":"?","env":"?","host_groups":["?","?"]} {"index":{"_index":"?","_routing":"?","_type":"?"}} {"trace_id":"?","span":[{"service":"?","name":"?","resource":"?","duration":"?","error":"?","meta":{"db.application":"?","db.name":"?","db.user":"?","out.host":"?","out.port":"?","sql.query":"?","system.pid":"?"},"metrics":{"_sample_rate":"?","_sampling_priority_v1":"?","_top_level":"?","db.rowcount":"?"},"type":"?","resource_hash":"?","start":"?","end":"?"... + + + diff --git a/pkg/trace/osutil/file.go b/pkg/trace/osutil/file.go new file mode 100644 index 0000000000000..2f829e8a6d732 --- /dev/null +++ b/pkg/trace/osutil/file.go @@ -0,0 +1,38 @@ +package osutil + +import ( + "fmt" + "os" + + "github.com/DataDog/datadog-agent/pkg/trace/flags" + log "github.com/cihub/seelog" +) + +// Exists reports whether the given path exists. +func Exists(path string) bool { + _, err := os.Stat(path) + return !os.IsNotExist(err) +} + +// Exit prints the message and exits the program with status code 1. +func Exit(msg string) { + if flags.Info || flags.Version { + fmt.Println(msg) + } else { + log.Error(msg) + log.Flush() + } + os.Exit(1) +} + +// Exitf prints the formatted text and exits the program with status code 1. +func Exitf(format string, args ...interface{}) { + if flags.Info || flags.Version { + fmt.Printf(format, args...) + fmt.Print("") + } else { + log.Errorf(format, args...) + log.Flush() + } + os.Exit(1) +} diff --git a/pkg/trace/pb/decoder.go b/pkg/trace/pb/decoder.go new file mode 100644 index 0000000000000..c8902c50e68b0 --- /dev/null +++ b/pkg/trace/pb/decoder.go @@ -0,0 +1,195 @@ +package pb + +import ( + "errors" + "math" + + "github.com/tinylib/msgp/msgp" +) + +// parseString reads the next type in the msgpack payload and +// converts the BinType or the StrType in a valid string. +func parseString(dc *msgp.Reader) (string, error) { + // read the generic representation type without decoding + t, err := dc.NextType() + if err != nil { + return "", err + } + switch t { + case msgp.BinType: + i, err := dc.ReadBytes(nil) + if err != nil { + return "", err + } + return msgp.UnsafeString(i), nil + case msgp.StrType: + i, err := dc.ReadString() + if err != nil { + return "", err + } + return i, nil + default: + return "", msgp.TypeError{Encoded: t, Method: msgp.StrType} + } +} + +// parseFloat64 parses a float64 even if the sent value is an int64 or an uint64; +// this is required because the encoding library could remove bytes from the encoded +// payload to reduce the size, if they're not needed. 
+func parseFloat64(dc *msgp.Reader) (float64, error) { + // read the generic representation type without decoding + t, err := dc.NextType() + if err != nil { + return 0, err + } + + switch t { + case msgp.IntType: + i, err := dc.ReadInt64() + if err != nil { + return 0, err + } + + return float64(i), nil + case msgp.UintType: + i, err := dc.ReadUint64() + if err != nil { + return 0, err + } + + return float64(i), nil + case msgp.Float64Type: + f, err := dc.ReadFloat64() + if err != nil { + return 0, err + } + + return f, nil + default: + return 0, msgp.TypeError{Encoded: t, Method: msgp.Float64Type} + } +} + +// cast to int64 values that are int64 but that are sent in uint64 +// over the wire. Set to 0 if they overflow the MaxInt64 size. This +// cast should be used ONLY while decoding int64 values that are +// sent as uint64 to reduce the payload size, otherwise the approach +// is not correct in the general sense. +func castInt64(v uint64) (int64, bool) { + if v > math.MaxInt64 { + return 0, false + } + + return int64(v), true +} + +// parseInt64 parses an int64 even if the sent value is an uint64; +// this is required because the encoding library could remove bytes from the encoded +// payload to reduce the size, if they're not needed. +func parseInt64(dc *msgp.Reader) (int64, error) { + // read the generic representation type without decoding + t, err := dc.NextType() + if err != nil { + return 0, err + } + + switch t { + case msgp.IntType: + i, err := dc.ReadInt64() + if err != nil { + return 0, err + } + return i, nil + case msgp.UintType: + u, err := dc.ReadUint64() + if err != nil { + return 0, err + } + + // force-cast + i, ok := castInt64(u) + if !ok { + return 0, errors.New("found uint64, overflows int64") + } + return i, nil + default: + return 0, msgp.TypeError{Encoded: t, Method: msgp.IntType} + } +} + +// parseUint64 parses an uint64 even if the sent value is an int64; +// this is required because the language used for the encoding library +// may not have unsigned types. An example is early version of Java +// (and so JRuby interpreter) that encodes uint64 as int64: +// http://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html +func parseUint64(dc *msgp.Reader) (uint64, error) { + // read the generic representation type without decoding + t, err := dc.NextType() + if err != nil { + return 0, err + } + + switch t { + case msgp.UintType: + u, err := dc.ReadUint64() + if err != nil { + return 0, err + } + return u, err + case msgp.IntType: + i, err := dc.ReadInt64() + if err != nil { + return 0, err + } + return uint64(i), nil + default: + return 0, msgp.TypeError{Encoded: t, Method: msgp.IntType} + } +} + +// cast to int32 values that are int32 but that are sent in uint32 +// over the wire. Set to 0 if they overflow the MaxInt32 size. This +// cast should be used ONLY while decoding int32 values that are +// sent as uint32 to reduce the payload size, otherwise the approach +// is not correct in the general sense. +func castInt32(v uint32) (int32, bool) { + if v > math.MaxInt32 { + return 0, false + } + + return int32(v), true +} + +// parseInt32 parses an int32 even if the sent value is an uint32; +// this is required because the encoding library could remove bytes from the encoded +// payload to reduce the size, if they're not needed. 
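+//
+// Behaviour sketch (illustrative): as with parseInt64 above, unsigned values that
+// fit are force-cast and values above math.MaxInt32 are rejected rather than
+// wrapped, e.g.
+//
+//	uint32(7)          -> int32(7), nil
+//	uint32(0x80000000) -> 0, error "found uint32, overflows int32"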
+func parseInt32(dc *msgp.Reader) (int32, error) { + // read the generic representation type without decoding + t, err := dc.NextType() + if err != nil { + return 0, err + } + + switch t { + case msgp.IntType: + i, err := dc.ReadInt32() + if err != nil { + return 0, err + } + return i, nil + case msgp.UintType: + u, err := dc.ReadUint32() + if err != nil { + return 0, err + } + + // force-cast + i, ok := castInt32(u) + if !ok { + return 0, errors.New("found uint32, overflows int32") + } + return i, nil + default: + return 0, msgp.TypeError{Encoded: t, Method: msgp.IntType} + } +} diff --git a/pkg/trace/pb/decoder_test.go b/pkg/trace/pb/decoder_test.go new file mode 100644 index 0000000000000..c940587ef4b04 --- /dev/null +++ b/pkg/trace/pb/decoder_test.go @@ -0,0 +1,41 @@ +package pb + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/tinylib/msgp/msgp" +) + +func TestParseFloat64(t *testing.T) { + assert := assert.New(t) + + data := []byte{ + 0x2a, // 42 + 0xd1, 0xfb, 0x2e, // -1234 + 0xcd, 0x0a, 0x9b, // 2715 + 0xcb, 0x40, 0x09, 0x1e, 0xb8, 0x51, 0xeb, 0x85, 0x1f, // float64(3.14) + } + + reader := msgp.NewReader(bytes.NewReader(data)) + + var f float64 + var err error + + f, err = parseFloat64(reader) + assert.NoError(err) + assert.Equal(42.0, f) + + f, err = parseFloat64(reader) + assert.NoError(err) + assert.Equal(-1234.0, f) + + f, err = parseFloat64(reader) + assert.NoError(err) + assert.Equal(2715.0, f) + + f, err = parseFloat64(reader) + assert.NoError(err) + assert.Equal(3.14, f) +} diff --git a/pkg/trace/pb/doc.go b/pkg/trace/pb/doc.go new file mode 100644 index 0000000000000..b6a0bb5eedf83 --- /dev/null +++ b/pkg/trace/pb/doc.go @@ -0,0 +1,7 @@ +// Package pb contains the data structures used by the trace agent to communicate +// with tracers and the Datadog API. Note that the "//go:generate" directives from this +// package were removed because the generated files were manually edited to create +// adaptions (see decoder.go). 
+// +// TODO: eventually move this to https://github.com/DataDog/agent-payload +package pb diff --git a/pkg/trace/pb/services.go b/pkg/trace/pb/services.go new file mode 100644 index 0000000000000..50b01b13175c4 --- /dev/null +++ b/pkg/trace/pb/services.go @@ -0,0 +1,4 @@ +package pb + +// ServicesMetadata is a standard key/val meta map attached to each named service +type ServicesMetadata map[string]map[string]string diff --git a/pkg/trace/pb/services_gen.go b/pkg/trace/pb/services_gen.go new file mode 100644 index 0000000000000..5610bc6e7c7ba --- /dev/null +++ b/pkg/trace/pb/services_gen.go @@ -0,0 +1,107 @@ +package pb + +// NOTE: THIS FILE WAS PRODUCED BY THE +// MSGP CODE GENERATION TOOL (github.com/tinylib/msgp) +// DO NOT EDIT + +import "github.com/tinylib/msgp/msgp" + +// DecodeMsg implements msgp.Decodable +func (z *ServicesMetadata) DecodeMsg(dc *msgp.Reader) (err error) { + var zxhx uint32 + zxhx, err = dc.ReadMapHeader() + if err != nil { + return + } + if (*z) == nil && zxhx > 0 { + (*z) = make(ServicesMetadata, zxhx) + } else if len((*z)) > 0 { + for key, _ := range *z { + delete((*z), key) + } + } + for zxhx > 0 { + zxhx-- + var zajw string + var zwht map[string]string + zajw, err = parseString(dc) + if err != nil { + return + } + var zlqf uint32 + zlqf, err = dc.ReadMapHeader() + if err != nil { + return + } + if zwht == nil && zlqf > 0 { + zwht = make(map[string]string, zlqf) + } else if len(zwht) > 0 { + for key, _ := range zwht { + delete(zwht, key) + } + } + for zlqf > 0 { + zlqf-- + var zhct string + var zcua string + zhct, err = parseString(dc) + if err != nil { + return + } + zcua, err = parseString(dc) + if err != nil { + return + } + zwht[zhct] = zcua + } + (*z)[zajw] = zwht + } + return +} + +// EncodeMsg implements msgp.Encodable +func (z ServicesMetadata) EncodeMsg(en *msgp.Writer) (err error) { + err = en.WriteMapHeader(uint32(len(z))) + if err != nil { + return + } + for zdaf, zpks := range z { + err = en.WriteString(zdaf) + if err != nil { + return + } + err = en.WriteMapHeader(uint32(len(zpks))) + if err != nil { + return + } + for zjfb, zcxo := range zpks { + err = en.WriteString(zjfb) + if err != nil { + return + } + err = en.WriteString(zcxo) + if err != nil { + return + } + } + } + return +} + +// Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message +func (z ServicesMetadata) Msgsize() (s int) { + s = msgp.MapHeaderSize + if z != nil { + for zdaf, zpks := range z { + _ = zpks + s += msgp.StringPrefixSize + len(zdaf) + msgp.MapHeaderSize + if zpks != nil { + for zjfb, zcxo := range zpks { + _ = zcxo + s += msgp.StringPrefixSize + len(zjfb) + msgp.StringPrefixSize + len(zcxo) + } + } + } + } + return +} diff --git a/pkg/trace/pb/services_gen_test.go b/pkg/trace/pb/services_gen_test.go new file mode 100644 index 0000000000000..9c0c22fe1e06a --- /dev/null +++ b/pkg/trace/pb/services_gen_test.go @@ -0,0 +1,67 @@ +package pb + +// NOTE: THIS FILE WAS PRODUCED BY THE +// MSGP CODE GENERATION TOOL (github.com/tinylib/msgp) +// DO NOT EDIT + +import ( + "bytes" + "testing" + + "github.com/tinylib/msgp/msgp" +) + +func TestEncodeDecodeServicesMetadata(t *testing.T) { + v := ServicesMetadata{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + + m := v.Msgsize() + if buf.Len() > m { + t.Logf("WARNING: Msgsize() for %v is inaccurate", v) + } + + vn := ServicesMetadata{} + err := msgp.Decode(&buf, &vn) + if err != nil { + t.Error(err) + } + + buf.Reset() + msgp.Encode(&buf, &v) + err = msgp.NewReader(&buf).Skip() + if err != 
nil { + t.Error(err) + } +} + +func BenchmarkEncodeServicesMetadata(b *testing.B) { + v := ServicesMetadata{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + b.SetBytes(int64(buf.Len())) + en := msgp.NewWriter(msgp.Nowhere) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + v.EncodeMsg(en) + } + en.Flush() +} + +func BenchmarkDecodeServicesMetadata(b *testing.B) { + v := ServicesMetadata{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + b.SetBytes(int64(buf.Len())) + rd := msgp.NewEndlessReader(buf.Bytes(), b) + dc := msgp.NewReader(rd) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := v.DecodeMsg(dc) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/pkg/trace/pb/span.pb.go b/pkg/trace/pb/span.pb.go new file mode 100644 index 0000000000000..8a8246c45cb7f --- /dev/null +++ b/pkg/trace/pb/span.pb.go @@ -0,0 +1,917 @@ +// Code generated by protoc-gen-gogo. +// source: span.proto +// DO NOT EDIT! + +/* + Package model is a generated protocol buffer package. + + It is generated from these files: + span.proto + trace.proto + trace_payload.proto + + It has these top-level messages: + Span + APITrace + TracePayload +*/ +package pb + +import proto "github.com/gogo/protobuf/proto" +import fmt "fmt" +import math "math" +import _ "github.com/gogo/protobuf/gogoproto" + +import io "io" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package + +type Span struct { + Service string `protobuf:"bytes,1,opt,name=service,proto3" json:"service" msg:"service"` + Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name" msg:"name"` + Resource string `protobuf:"bytes,3,opt,name=resource,proto3" json:"resource" msg:"resource"` + TraceID uint64 `protobuf:"varint,4,opt,name=traceID,proto3" json:"trace_id" msg:"trace_id"` + SpanID uint64 `protobuf:"varint,5,opt,name=spanID,proto3" json:"span_id" msg:"span_id"` + ParentID uint64 `protobuf:"varint,6,opt,name=parentID,proto3" json:"parent_id" msg:"parent_id"` + Start int64 `protobuf:"varint,7,opt,name=start,proto3" json:"start" msg:"start"` + Duration int64 `protobuf:"varint,8,opt,name=duration,proto3" json:"duration" msg:"duration"` + Error int32 `protobuf:"varint,9,opt,name=error,proto3" json:"error" msg:"error"` + Meta map[string]string `protobuf:"bytes,10,rep,name=meta" json:"meta" msg:"meta" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + Metrics map[string]float64 `protobuf:"bytes,11,rep,name=metrics" json:"metrics" msg:"metrics" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"fixed64,2,opt,name=value,proto3"` + Type string `protobuf:"bytes,12,opt,name=type,proto3" json:"type" msg:"type"` +} + +func (m *Span) Reset() { *m = Span{} } +func (m *Span) String() string { return proto.CompactTextString(m) } +func (*Span) ProtoMessage() {} +func (*Span) Descriptor() ([]byte, []int) { return fileDescriptorSpan, []int{0} } + +func (m *Span) GetMeta() map[string]string { + if m != nil { + return m.Meta + } + return nil +} + +func (m *Span) GetMetrics() map[string]float64 { + if m != nil { + return m.Metrics + } + return nil +} + +func init() { + 
proto.RegisterType((*Span)(nil), "model.Span") +} +func (m *Span) Marshal() (data []byte, err error) { + size := m.Size() + data = make([]byte, size) + n, err := m.MarshalTo(data) + if err != nil { + return nil, err + } + return data[:n], nil +} + +func (m *Span) MarshalTo(data []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.Service) > 0 { + data[i] = 0xa + i++ + i = encodeVarintSpan(data, i, uint64(len(m.Service))) + i += copy(data[i:], m.Service) + } + if len(m.Name) > 0 { + data[i] = 0x12 + i++ + i = encodeVarintSpan(data, i, uint64(len(m.Name))) + i += copy(data[i:], m.Name) + } + if len(m.Resource) > 0 { + data[i] = 0x1a + i++ + i = encodeVarintSpan(data, i, uint64(len(m.Resource))) + i += copy(data[i:], m.Resource) + } + if m.TraceID != 0 { + data[i] = 0x20 + i++ + i = encodeVarintSpan(data, i, uint64(m.TraceID)) + } + if m.SpanID != 0 { + data[i] = 0x28 + i++ + i = encodeVarintSpan(data, i, uint64(m.SpanID)) + } + if m.ParentID != 0 { + data[i] = 0x30 + i++ + i = encodeVarintSpan(data, i, uint64(m.ParentID)) + } + if m.Start != 0 { + data[i] = 0x38 + i++ + i = encodeVarintSpan(data, i, uint64(m.Start)) + } + if m.Duration != 0 { + data[i] = 0x40 + i++ + i = encodeVarintSpan(data, i, uint64(m.Duration)) + } + if m.Error != 0 { + data[i] = 0x48 + i++ + i = encodeVarintSpan(data, i, uint64(m.Error)) + } + if len(m.Meta) > 0 { + for k, _ := range m.Meta { + data[i] = 0x52 + i++ + v := m.Meta[k] + mapSize := 1 + len(k) + sovSpan(uint64(len(k))) + 1 + len(v) + sovSpan(uint64(len(v))) + i = encodeVarintSpan(data, i, uint64(mapSize)) + data[i] = 0xa + i++ + i = encodeVarintSpan(data, i, uint64(len(k))) + i += copy(data[i:], k) + data[i] = 0x12 + i++ + i = encodeVarintSpan(data, i, uint64(len(v))) + i += copy(data[i:], v) + } + } + if len(m.Metrics) > 0 { + for k, _ := range m.Metrics { + data[i] = 0x5a + i++ + v := m.Metrics[k] + mapSize := 1 + len(k) + sovSpan(uint64(len(k))) + 1 + 8 + i = encodeVarintSpan(data, i, uint64(mapSize)) + data[i] = 0xa + i++ + i = encodeVarintSpan(data, i, uint64(len(k))) + i += copy(data[i:], k) + data[i] = 0x11 + i++ + i = encodeFixed64Span(data, i, uint64(math.Float64bits(float64(v)))) + } + } + if len(m.Type) > 0 { + data[i] = 0x62 + i++ + i = encodeVarintSpan(data, i, uint64(len(m.Type))) + i += copy(data[i:], m.Type) + } + return i, nil +} + +func encodeFixed64Span(data []byte, offset int, v uint64) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + data[offset+4] = uint8(v >> 32) + data[offset+5] = uint8(v >> 40) + data[offset+6] = uint8(v >> 48) + data[offset+7] = uint8(v >> 56) + return offset + 8 +} +func encodeFixed32Span(data []byte, offset int, v uint32) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + return offset + 4 +} +func encodeVarintSpan(data []byte, offset int, v uint64) int { + for v >= 1<<7 { + data[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + data[offset] = uint8(v) + return offset + 1 +} +func (m *Span) Size() (n int) { + var l int + _ = l + l = len(m.Service) + if l > 0 { + n += 1 + l + sovSpan(uint64(l)) + } + l = len(m.Name) + if l > 0 { + n += 1 + l + sovSpan(uint64(l)) + } + l = len(m.Resource) + if l > 0 { + n += 1 + l + sovSpan(uint64(l)) + } + if m.TraceID != 0 { + n += 1 + sovSpan(uint64(m.TraceID)) + } + if m.SpanID != 0 { + n += 1 + sovSpan(uint64(m.SpanID)) + } + if m.ParentID != 0 { + n += 1 + 
sovSpan(uint64(m.ParentID)) + } + if m.Start != 0 { + n += 1 + sovSpan(uint64(m.Start)) + } + if m.Duration != 0 { + n += 1 + sovSpan(uint64(m.Duration)) + } + if m.Error != 0 { + n += 1 + sovSpan(uint64(m.Error)) + } + if len(m.Meta) > 0 { + for k, v := range m.Meta { + _ = k + _ = v + mapEntrySize := 1 + len(k) + sovSpan(uint64(len(k))) + 1 + len(v) + sovSpan(uint64(len(v))) + n += mapEntrySize + 1 + sovSpan(uint64(mapEntrySize)) + } + } + if len(m.Metrics) > 0 { + for k, v := range m.Metrics { + _ = k + _ = v + mapEntrySize := 1 + len(k) + sovSpan(uint64(len(k))) + 1 + 8 + n += mapEntrySize + 1 + sovSpan(uint64(mapEntrySize)) + } + } + l = len(m.Type) + if l > 0 { + n += 1 + l + sovSpan(uint64(l)) + } + return n +} + +func sovSpan(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} +func sozSpan(x uint64) (n int) { + return sovSpan(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *Span) Unmarshal(data []byte) error { + l := len(data) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: Span: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: Span: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Service", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSpan + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Service = string(data[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSpan + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Name = string(data[iNdEx:postIndex]) + iNdEx = postIndex + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Resource", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSpan + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Resource = string(data[iNdEx:postIndex]) + iNdEx = postIndex + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field 
TraceID", wireType) + } + m.TraceID = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.TraceID |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field SpanID", wireType) + } + m.SpanID = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.SpanID |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 6: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field ParentID", wireType) + } + m.ParentID = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.ParentID |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Start", wireType) + } + m.Start = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.Start |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 8: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Duration", wireType) + } + m.Duration = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.Duration |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 9: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Error", wireType) + } + m.Error = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.Error |= (int32(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 10: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Meta", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSpan + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + var keykey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + keykey |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + var stringLenmapkey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLenmapkey |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLenmapkey := int(stringLenmapkey) + if intStringLenmapkey < 0 { + return ErrInvalidLengthSpan + } + postStringIndexmapkey := iNdEx + intStringLenmapkey + if postStringIndexmapkey > l { + return io.ErrUnexpectedEOF + } + mapkey := string(data[iNdEx:postStringIndexmapkey]) + iNdEx = postStringIndexmapkey + if m.Meta == nil { + m.Meta = 
make(map[string]string) + } + if iNdEx < postIndex { + var valuekey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + valuekey |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + var stringLenmapvalue uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLenmapvalue |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLenmapvalue := int(stringLenmapvalue) + if intStringLenmapvalue < 0 { + return ErrInvalidLengthSpan + } + postStringIndexmapvalue := iNdEx + intStringLenmapvalue + if postStringIndexmapvalue > l { + return io.ErrUnexpectedEOF + } + mapvalue := string(data[iNdEx:postStringIndexmapvalue]) + iNdEx = postStringIndexmapvalue + m.Meta[mapkey] = mapvalue + } else { + var mapvalue string + m.Meta[mapkey] = mapvalue + } + iNdEx = postIndex + case 11: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Metrics", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSpan + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + var keykey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + keykey |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + var stringLenmapkey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLenmapkey |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLenmapkey := int(stringLenmapkey) + if intStringLenmapkey < 0 { + return ErrInvalidLengthSpan + } + postStringIndexmapkey := iNdEx + intStringLenmapkey + if postStringIndexmapkey > l { + return io.ErrUnexpectedEOF + } + mapkey := string(data[iNdEx:postStringIndexmapkey]) + iNdEx = postStringIndexmapkey + if m.Metrics == nil { + m.Metrics = make(map[string]float64) + } + if iNdEx < postIndex { + var valuekey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + valuekey |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + var mapvaluetemp uint64 + if (iNdEx + 8) > l { + return io.ErrUnexpectedEOF + } + iNdEx += 8 + mapvaluetemp = uint64(data[iNdEx-8]) + mapvaluetemp |= uint64(data[iNdEx-7]) << 8 + mapvaluetemp |= uint64(data[iNdEx-6]) << 16 + mapvaluetemp |= uint64(data[iNdEx-5]) << 24 + mapvaluetemp |= uint64(data[iNdEx-4]) << 32 + mapvaluetemp |= uint64(data[iNdEx-3]) << 40 + mapvaluetemp |= uint64(data[iNdEx-2]) << 48 + mapvaluetemp |= uint64(data[iNdEx-1]) << 56 + mapvalue := math.Float64frombits(mapvaluetemp) + m.Metrics[mapkey] = mapvalue + } else { + var mapvalue float64 + m.Metrics[mapkey] = mapvalue + } + iNdEx = postIndex + case 12: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType) + } + var stringLen uint64 + for shift := uint(0); ; 
shift += 7 { + if shift >= 64 { + return ErrIntOverflowSpan + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSpan + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Type = string(data[iNdEx:postIndex]) + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSpan(data[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthSpan + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipSpan(data []byte) (n int, err error) { + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSpan + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSpan + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if data[iNdEx-1] < 0x80 { + break + } + } + return iNdEx, nil + case 1: + iNdEx += 8 + return iNdEx, nil + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSpan + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + iNdEx += length + if length < 0 { + return 0, ErrInvalidLengthSpan + } + return iNdEx, nil + case 3: + for { + var innerWire uint64 + var start int = iNdEx + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSpan + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + innerWire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + innerWireType := int(innerWire & 0x7) + if innerWireType == 4 { + break + } + next, err := skipSpan(data[start:]) + if err != nil { + return 0, err + } + iNdEx = start + next + } + return iNdEx, nil + case 4: + return iNdEx, nil + case 5: + iNdEx += 4 + return iNdEx, nil + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + } + panic("unreachable") +} + +var ( + ErrInvalidLengthSpan = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowSpan = fmt.Errorf("proto: integer overflow") +) + +func init() { proto.RegisterFile("span.proto", fileDescriptorSpan) } + +var fileDescriptorSpan = []byte{ + // 493 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x84, 0x93, 0xcd, 0x8e, 0xd3, 0x30, + 0x10, 0xc7, 0xf1, 0x36, 0xe9, 0x87, 0xbb, 0xc0, 0xca, 0x02, 0x64, 0x55, 0x28, 0x89, 0x7c, 0x8a, + 0x90, 0xc8, 0x4a, 0x80, 0x60, 0x55, 0x71, 0xa1, 0x2a, 0x87, 0x1e, 0x90, 0x90, 0x79, 0x00, 0xe4, + 0xa6, 0xa6, 0x44, 0x6c, 0x3e, 0xe4, 0x38, 0x2b, 0xf5, 0x2d, 0x78, 0x0a, 0x9e, 0x85, 0x23, 0x4f, + 0x10, 0xa1, 0x72, 0xcb, 0xb1, 0x4f, 0x80, 0x3c, 0x4e, 0xcc, 0x8a, 0xcb, 0xde, 0xf2, 0xff, 0xcd, + 0xfc, 0x3d, 0x9e, 0xf1, 0x04, 0xe3, 0xba, 0x12, 0x45, 0x52, 0xa9, 0x52, 0x97, 0xc4, 0xcf, 0xcb, + 0x9d, 0xbc, 0x5e, 0x3c, 0xdf, 0x67, 0xfa, 0x6b, 0xb3, 0x4d, 0xd2, 0x32, 0xbf, 0xdc, 0x97, 0xfb, + 0xf2, 0x12, 0xa2, 0xdb, 
0xe6, 0x0b, 0x28, 0x10, 0xf0, 0x65, 0x5d, 0xec, 0xc7, 0x18, 0x7b, 0x9f, + 0x2a, 0x51, 0x90, 0xd7, 0x78, 0x52, 0x4b, 0x75, 0x93, 0xa5, 0x92, 0xa2, 0x08, 0xc5, 0xb3, 0xd5, + 0xd3, 0xae, 0x0d, 0x07, 0x74, 0x6a, 0xc3, 0xfb, 0x79, 0xbd, 0x5f, 0xb2, 0x5e, 0x33, 0x3e, 0x44, + 0xc8, 0x33, 0xec, 0x15, 0x22, 0x97, 0xf4, 0x0c, 0x4c, 0x4f, 0xba, 0x36, 0x04, 0x7d, 0x6a, 0x43, + 0x0c, 0x0e, 0x23, 0x18, 0x07, 0x46, 0x96, 0x78, 0xaa, 0x64, 0x5d, 0x36, 0x2a, 0x95, 0x74, 0x04, + 0xf9, 0x41, 0xd7, 0x86, 0x8e, 0x9d, 0xda, 0xf0, 0x01, 0x78, 0x06, 0xc0, 0xb8, 0x8b, 0x91, 0x2b, + 0x3c, 0xd1, 0x4a, 0xa4, 0x72, 0xb3, 0xa6, 0x5e, 0x84, 0x62, 0xcf, 0x5a, 0x01, 0x7d, 0xce, 0x76, + 0xce, 0x3a, 0x00, 0xc6, 0x87, 0x74, 0xf2, 0x0a, 0x8f, 0xcd, 0x98, 0x36, 0x6b, 0xea, 0x83, 0xd1, + 0x36, 0x56, 0x89, 0xc2, 0xfa, 0xfa, 0xc6, 0xac, 0x66, 0xbc, 0xcf, 0x25, 0x6f, 0xf1, 0xb4, 0x12, + 0x4a, 0x16, 0x7a, 0xb3, 0xa6, 0x63, 0xf0, 0x45, 0x5d, 0x1b, 0xce, 0x2c, 0xb3, 0xce, 0x87, 0xe0, + 0x74, 0x84, 0x71, 0xe7, 0x20, 0x09, 0xf6, 0x6b, 0x2d, 0x94, 0xa6, 0x93, 0x08, 0xc5, 0xa3, 0x15, + 0xed, 0xda, 0xd0, 0x82, 0x53, 0x1b, 0xce, 0x6d, 0x41, 0xa3, 0x18, 0xb7, 0xd4, 0x4c, 0x66, 0xd7, + 0x28, 0xa1, 0xb3, 0xb2, 0xa0, 0x53, 0xb0, 0x40, 0x7b, 0x03, 0x73, 0xed, 0x0d, 0x80, 0x71, 0x17, + 0x33, 0xb5, 0xa4, 0x52, 0xa5, 0xa2, 0xb3, 0x08, 0xc5, 0xbe, 0xad, 0x05, 0xc0, 0xd5, 0x02, 0xc5, + 0xb8, 0xa5, 0xe4, 0x1d, 0xf6, 0x72, 0xa9, 0x05, 0xc5, 0xd1, 0x28, 0x9e, 0xbf, 0x78, 0x9c, 0xc0, + 0xde, 0x24, 0x66, 0x09, 0x92, 0x0f, 0x52, 0x8b, 0xf7, 0x85, 0x56, 0x07, 0xfb, 0x90, 0x26, 0xcd, + 0x3d, 0xa4, 0x11, 0x8c, 0x03, 0x23, 0x1f, 0xf1, 0x24, 0x97, 0x5a, 0x65, 0x69, 0x4d, 0xe7, 0x70, + 0x0a, 0xfd, 0xef, 0x14, 0x13, 0xb2, 0x07, 0xc1, 0xb4, 0xfb, 0x64, 0x37, 0xed, 0x5e, 0x33, 0x3e, + 0x44, 0xcc, 0x1a, 0xe9, 0x43, 0x25, 0xe9, 0xf9, 0xbf, 0x35, 0x32, 0xda, 0x55, 0x37, 0x82, 0x71, + 0x60, 0x8b, 0x37, 0x78, 0xe6, 0x2e, 0x4a, 0x2e, 0xf0, 0xe8, 0x9b, 0x3c, 0xd8, 0x9d, 0xe5, 0xe6, + 0x93, 0x3c, 0xc2, 0xfe, 0x8d, 0xb8, 0x6e, 0xfa, 0x95, 0xe4, 0x56, 0x2c, 0xcf, 0xae, 0xd0, 0x62, + 0x89, 0xcf, 0x6f, 0xdf, 0xed, 0x2e, 0x2f, 0xba, 0xe5, 0x5d, 0x5d, 0xfc, 0x3c, 0x06, 0xe8, 0xd7, + 0x31, 0x40, 0xbf, 0x8f, 0x01, 0xfa, 0xfe, 0x27, 0xb8, 0xb7, 0x1d, 0xc3, 0x1f, 0xf4, 0xf2, 0x6f, + 0x00, 0x00, 0x00, 0xff, 0xff, 0x94, 0x9b, 0x26, 0x0f, 0x85, 0x03, 0x00, 0x00, +} diff --git a/pkg/trace/pb/span.proto b/pkg/trace/pb/span.proto new file mode 100644 index 0000000000000..e435d3dcd7ae7 --- /dev/null +++ b/pkg/trace/pb/span.proto @@ -0,0 +1,20 @@ +syntax = "proto3"; + +package pb; + +import "github.com/gogo/protobuf/gogoproto/gogo.proto"; + +message Span { + string service = 1 [(gogoproto.jsontag) = "service", (gogoproto.moretags) = "msg:\"service\""]; + string name = 2 [(gogoproto.jsontag) = "name", (gogoproto.moretags) = "msg:\"name\""]; + string resource = 3 [(gogoproto.jsontag) = "resource", (gogoproto.moretags) = "msg:\"resource\""]; + uint64 traceID = 4 [(gogoproto.jsontag) = "trace_id", (gogoproto.moretags) = "msg:\"trace_id\""]; + uint64 spanID = 5 [(gogoproto.jsontag) = "span_id", (gogoproto.moretags) = "msg:\"span_id\""]; + uint64 parentID = 6 [(gogoproto.jsontag) = "parent_id", (gogoproto.moretags) = "msg:\"parent_id\""]; + int64 start = 7 [(gogoproto.jsontag) = "start", (gogoproto.moretags) = "msg:\"start\""]; + int64 duration = 8 [(gogoproto.jsontag) = "duration", (gogoproto.moretags) = "msg:\"duration\""]; + int32 error = 9 [(gogoproto.jsontag) = "error", (gogoproto.moretags) = "msg:\"error\""]; + map meta = 10 [(gogoproto.jsontag) = "meta", (gogoproto.moretags) = 
"msg:\"meta\""]; + map metrics = 11 [(gogoproto.jsontag) = "metrics", (gogoproto.moretags) = "msg:\"metrics\""]; + string type = 12 [(gogoproto.jsontag) = "type", (gogoproto.moretags) = "msg:\"type\""]; +} diff --git a/pkg/trace/pb/span_gen.go b/pkg/trace/pb/span_gen.go new file mode 100644 index 0000000000000..c8d6561546f3b --- /dev/null +++ b/pkg/trace/pb/span_gen.go @@ -0,0 +1,350 @@ +package pb + +import ( + "github.com/tinylib/msgp/msgp" +) + +// DecodeMsg implements msgp.Decodable +func (z *Span) DecodeMsg(dc *msgp.Reader) (err error) { + var field []byte + _ = field + var zajw uint32 + zajw, err = dc.ReadMapHeader() + if err != nil { + return + } + for zajw > 0 { + zajw-- + field, err = dc.ReadMapKeyPtr() + if err != nil { + return + } + + switch msgp.UnsafeString(field) { + case "service": + if dc.IsNil() { + z.Service, err = "", dc.ReadNil() + break + } + + z.Service, err = parseString(dc) + if err != nil { + return + } + case "name": + if dc.IsNil() { + z.Name, err = "", dc.ReadNil() + break + } + + z.Name, err = parseString(dc) + if err != nil { + return + } + case "resource": + if dc.IsNil() { + z.Resource, err = "", dc.ReadNil() + break + } + + z.Resource, err = parseString(dc) + if err != nil { + return + } + case "trace_id": + if dc.IsNil() { + z.TraceID, err = 0, dc.ReadNil() + break + } + + z.TraceID, err = parseUint64(dc) + if err != nil { + return + } + case "span_id": + if dc.IsNil() { + z.SpanID, err = 0, dc.ReadNil() + break + } + + z.SpanID, err = parseUint64(dc) + if err != nil { + return + } + case "start": + if dc.IsNil() { + z.Start, err = 0, dc.ReadNil() + break + } + + z.Start, err = parseInt64(dc) + if err != nil { + return + } + case "duration": + if dc.IsNil() { + z.Duration, err = 0, dc.ReadNil() + break + } + + z.Duration, err = parseInt64(dc) + if err != nil { + return + } + case "error": + if dc.IsNil() { + z.Error, err = 0, dc.ReadNil() + break + } + + z.Error, err = parseInt32(dc) + if err != nil { + return + } + case "meta": + if dc.IsNil() { + z.Meta, err = nil, dc.ReadNil() + break + } + + var zwht uint32 + zwht, err = dc.ReadMapHeader() + if err != nil { + return + } + if z.Meta == nil && zwht > 0 { + z.Meta = make(map[string]string, zwht) + } else if len(z.Meta) > 0 { + for key, _ := range z.Meta { + delete(z.Meta, key) + } + } + for zwht > 0 { + zwht-- + var zxvk string + var zbzg string + zxvk, err = parseString(dc) + if err != nil { + return + } + zbzg, err = parseString(dc) + if err != nil { + return + } + z.Meta[zxvk] = zbzg + } + case "metrics": + if dc.IsNil() { + z.Metrics, err = nil, dc.ReadNil() + break + } + + var zhct uint32 + zhct, err = dc.ReadMapHeader() + if err != nil { + return + } + if z.Metrics == nil && zhct > 0 { + z.Metrics = make(map[string]float64, zhct) + } else if len(z.Metrics) > 0 { + for key, _ := range z.Metrics { + delete(z.Metrics, key) + } + } + for zhct > 0 { + zhct-- + var zbai string + var zcmr float64 + zbai, err = parseString(dc) + if err != nil { + return + } + zcmr, err = parseFloat64(dc) + if err != nil { + return + } + z.Metrics[zbai] = zcmr + } + case "parent_id": + if dc.IsNil() { + z.ParentID, err = 0, dc.ReadNil() + break + } + + z.ParentID, err = parseUint64(dc) + if err != nil { + return + } + case "type": + if dc.IsNil() { + z.Type, err = "", dc.ReadNil() + break + } + + z.Type, err = parseString(dc) + if err != nil { + return + } + default: + err = dc.Skip() + if err != nil { + return + } + } + } + return +} + +// EncodeMsg implements msgp.Encodable +func (z *Span) EncodeMsg(en *msgp.Writer) (err 
error) { + // map header, size 12 + // write "service" + err = en.Append(0x8c, 0xa7, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65) + if err != nil { + return err + } + err = en.WriteString(z.Service) + if err != nil { + return + } + // write "name" + err = en.Append(0xa4, 0x6e, 0x61, 0x6d, 0x65) + if err != nil { + return err + } + err = en.WriteString(z.Name) + if err != nil { + return + } + // write "resource" + err = en.Append(0xa8, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65) + if err != nil { + return err + } + err = en.WriteString(z.Resource) + if err != nil { + return + } + // write "trace_id" + err = en.Append(0xa8, 0x74, 0x72, 0x61, 0x63, 0x65, 0x5f, 0x69, 0x64) + if err != nil { + return err + } + err = en.WriteUint64(z.TraceID) + if err != nil { + return + } + // write "span_id" + err = en.Append(0xa7, 0x73, 0x70, 0x61, 0x6e, 0x5f, 0x69, 0x64) + if err != nil { + return err + } + err = en.WriteUint64(z.SpanID) + if err != nil { + return + } + // write "start" + err = en.Append(0xa5, 0x73, 0x74, 0x61, 0x72, 0x74) + if err != nil { + return err + } + err = en.WriteInt64(z.Start) + if err != nil { + return + } + // write "duration" + err = en.Append(0xa8, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e) + if err != nil { + return err + } + err = en.WriteInt64(z.Duration) + if err != nil { + return + } + // write "error" + err = en.Append(0xa5, 0x65, 0x72, 0x72, 0x6f, 0x72) + if err != nil { + return err + } + err = en.WriteInt32(z.Error) + if err != nil { + return + } + // write "meta" + err = en.Append(0xa4, 0x6d, 0x65, 0x74, 0x61) + if err != nil { + return err + } + err = en.WriteMapHeader(uint32(len(z.Meta))) + if err != nil { + return + } + for zxvk, zbzg := range z.Meta { + err = en.WriteString(zxvk) + if err != nil { + return + } + err = en.WriteString(zbzg) + if err != nil { + return + } + } + // write "metrics" + err = en.Append(0xa7, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73) + if err != nil { + return err + } + err = en.WriteMapHeader(uint32(len(z.Metrics))) + if err != nil { + return + } + for zbai, zcmr := range z.Metrics { + err = en.WriteString(zbai) + if err != nil { + return + } + err = en.WriteFloat64(zcmr) + if err != nil { + return + } + } + // write "parent_id" + err = en.Append(0xa9, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x5f, 0x69, 0x64) + if err != nil { + return err + } + err = en.WriteUint64(z.ParentID) + if err != nil { + return + } + // write "type" + err = en.Append(0xa4, 0x74, 0x79, 0x70, 0x65) + if err != nil { + return err + } + err = en.WriteString(z.Type) + if err != nil { + return + } + return +} + +// Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message +func (z *Span) Msgsize() (s int) { + s = 1 + 8 + msgp.StringPrefixSize + len(z.Service) + 5 + msgp.StringPrefixSize + len(z.Name) + 9 + msgp.StringPrefixSize + len(z.Resource) + 9 + msgp.Uint64Size + 8 + msgp.Uint64Size + 6 + msgp.Int64Size + 9 + msgp.Int64Size + 6 + msgp.Int32Size + 5 + msgp.MapHeaderSize + if z.Meta != nil { + for zxvk, zbzg := range z.Meta { + _ = zbzg + s += msgp.StringPrefixSize + len(zxvk) + msgp.StringPrefixSize + len(zbzg) + } + } + s += 8 + msgp.MapHeaderSize + if z.Metrics != nil { + for zbai, zcmr := range z.Metrics { + _ = zcmr + s += msgp.StringPrefixSize + len(zbai) + msgp.Float64Size + } + } + s += 10 + msgp.Uint64Size + 5 + msgp.StringPrefixSize + len(z.Type) + return +} diff --git a/pkg/trace/pb/trace.go b/pkg/trace/pb/trace.go new file mode 100644 index 0000000000000..ec77c9cbff414 --- /dev/null +++ b/pkg/trace/pb/trace.go 
@@ -0,0 +1,7 @@ +package pb + +// Trace is a collection of spans with the same trace ID +type Trace []*Span + +// Traces is a list of traces. This model matters as this is what we unpack from msgp. +type Traces []Trace diff --git a/pkg/trace/pb/trace.pb.go b/pkg/trace/pb/trace.pb.go new file mode 100644 index 0000000000000..babdeaa0ece3e --- /dev/null +++ b/pkg/trace/pb/trace.pb.go @@ -0,0 +1,404 @@ +// Code generated by protoc-gen-gogo. +// source: trace.proto +// DO NOT EDIT! + +package pb + +import proto "github.com/gogo/protobuf/proto" +import fmt "fmt" +import math "math" + +import io "io" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +type APITrace struct { + TraceID uint64 `protobuf:"varint,1,opt,name=traceID,proto3" json:"traceID,omitempty"` + Spans []*Span `protobuf:"bytes,2,rep,name=spans" json:"spans,omitempty"` + StartTime int64 `protobuf:"varint,6,opt,name=startTime,proto3" json:"startTime,omitempty"` + EndTime int64 `protobuf:"varint,7,opt,name=endTime,proto3" json:"endTime,omitempty"` +} + +func (m *APITrace) Reset() { *m = APITrace{} } +func (m *APITrace) String() string { return proto.CompactTextString(m) } +func (*APITrace) ProtoMessage() {} +func (*APITrace) Descriptor() ([]byte, []int) { return fileDescriptorTrace, []int{0} } + +func (m *APITrace) GetSpans() []*Span { + if m != nil { + return m.Spans + } + return nil +} + +func init() { + proto.RegisterType((*APITrace)(nil), "model.APITrace") +} +func (m *APITrace) Marshal() (data []byte, err error) { + size := m.Size() + data = make([]byte, size) + n, err := m.MarshalTo(data) + if err != nil { + return nil, err + } + return data[:n], nil +} + +func (m *APITrace) MarshalTo(data []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if m.TraceID != 0 { + data[i] = 0x8 + i++ + i = encodeVarintTrace(data, i, uint64(m.TraceID)) + } + if len(m.Spans) > 0 { + for _, msg := range m.Spans { + data[i] = 0x12 + i++ + i = encodeVarintTrace(data, i, uint64(msg.Size())) + n, err := msg.MarshalTo(data[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if m.StartTime != 0 { + data[i] = 0x30 + i++ + i = encodeVarintTrace(data, i, uint64(m.StartTime)) + } + if m.EndTime != 0 { + data[i] = 0x38 + i++ + i = encodeVarintTrace(data, i, uint64(m.EndTime)) + } + return i, nil +} + +func encodeFixed64Trace(data []byte, offset int, v uint64) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + data[offset+4] = uint8(v >> 32) + data[offset+5] = uint8(v >> 40) + data[offset+6] = uint8(v >> 48) + data[offset+7] = uint8(v >> 56) + return offset + 8 +} +func encodeFixed32Trace(data []byte, offset int, v uint32) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + return offset + 4 +} +func encodeVarintTrace(data []byte, offset int, v uint64) int { + for v >= 1<<7 { + data[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + data[offset] = uint8(v) + return offset + 1 +} +func (m *APITrace) Size() (n int) { + var l int + _ = l + if m.TraceID != 0 { + n += 1 + sovTrace(uint64(m.TraceID)) + } + if len(m.Spans) > 0 { + for _, e := range m.Spans { + l = e.Size() + n += 1 + l + sovTrace(uint64(l)) + } + } + if m.StartTime != 0 { + n += 1 + sovTrace(uint64(m.StartTime)) + } + if m.EndTime != 0 { + n += 1 + sovTrace(uint64(m.EndTime)) + } + return n +} + +func sovTrace(x uint64) 
(n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} +func sozTrace(x uint64) (n int) { + return sovTrace(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *APITrace) Unmarshal(data []byte) error { + l := len(data) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTrace + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: APITrace: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: APITrace: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field TraceID", wireType) + } + m.TraceID = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTrace + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.TraceID |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Spans", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTrace + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthTrace + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Spans = append(m.Spans, &Span{}) + if err := m.Spans[len(m.Spans)-1].Unmarshal(data[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 6: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StartTime", wireType) + } + m.StartTime = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTrace + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.StartTime |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field EndTime", wireType) + } + m.EndTime = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTrace + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + m.EndTime |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipTrace(data[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthTrace + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipTrace(data []byte) (n int, err error) { + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTrace + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTrace + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if data[iNdEx-1] < 
0x80 { + break + } + } + return iNdEx, nil + case 1: + iNdEx += 8 + return iNdEx, nil + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTrace + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + iNdEx += length + if length < 0 { + return 0, ErrInvalidLengthTrace + } + return iNdEx, nil + case 3: + for { + var innerWire uint64 + var start int = iNdEx + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTrace + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + innerWire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + innerWireType := int(innerWire & 0x7) + if innerWireType == 4 { + break + } + next, err := skipTrace(data[start:]) + if err != nil { + return 0, err + } + iNdEx = start + next + } + return iNdEx, nil + case 4: + return iNdEx, nil + case 5: + iNdEx += 4 + return iNdEx, nil + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + } + panic("unreachable") +} + +var ( + ErrInvalidLengthTrace = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowTrace = fmt.Errorf("proto: integer overflow") +) + +func init() { proto.RegisterFile("trace.proto", fileDescriptorTrace) } + +var fileDescriptorTrace = []byte{ + // 162 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x2e, 0x29, 0x4a, 0x4c, + 0x4e, 0xd5, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0xcd, 0xcd, 0x4f, 0x49, 0xcd, 0x91, 0xe2, + 0x2a, 0x2e, 0x48, 0xcc, 0x83, 0x08, 0x29, 0xd5, 0x73, 0x71, 0x38, 0x06, 0x78, 0x86, 0x80, 0x14, + 0x09, 0x49, 0x70, 0xb1, 0x83, 0x55, 0x7b, 0xba, 0x48, 0x30, 0x2a, 0x30, 0x6a, 0xb0, 0x04, 0xc1, + 0xb8, 0x42, 0x8a, 0x5c, 0xac, 0x20, 0x3d, 0xc5, 0x12, 0x4c, 0x0a, 0xcc, 0x1a, 0xdc, 0x46, 0xdc, + 0x7a, 0x60, 0x83, 0xf4, 0x82, 0x0b, 0x12, 0xf3, 0x82, 0x20, 0x32, 0x42, 0x32, 0x5c, 0x9c, 0xc5, + 0x25, 0x89, 0x45, 0x25, 0x21, 0x99, 0xb9, 0xa9, 0x12, 0x6c, 0x0a, 0x8c, 0x1a, 0xcc, 0x41, 0x08, + 0x01, 0x90, 0xd1, 0xa9, 0x79, 0x29, 0x60, 0x39, 0x76, 0xb0, 0x1c, 0x8c, 0xeb, 0x24, 0x70, 0xe2, + 0x91, 0x1c, 0xe3, 0x85, 0x47, 0x72, 0x8c, 0x0f, 0x1e, 0xc9, 0x31, 0x4e, 0x78, 0x2c, 0xc7, 0x90, + 0xc4, 0x06, 0x76, 0x99, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0xd4, 0xf7, 0x13, 0xf9, 0xbb, 0x00, + 0x00, 0x00, +} diff --git a/pkg/trace/pb/trace.proto b/pkg/trace/pb/trace.proto new file mode 100644 index 0000000000000..cddb138c09799 --- /dev/null +++ b/pkg/trace/pb/trace.proto @@ -0,0 +1,12 @@ +syntax = "proto3"; + +package pb; + +import "span.proto"; + +message APITrace { + uint64 traceID = 1; + repeated Span spans = 2; + int64 startTime = 6; + int64 endTime = 7; +} diff --git a/pkg/trace/pb/trace_gen.go b/pkg/trace/pb/trace_gen.go new file mode 100644 index 0000000000000..2e9e9c838a38b --- /dev/null +++ b/pkg/trace/pb/trace_gen.go @@ -0,0 +1,162 @@ +package pb + +// NOTE: THIS FILE WAS PRODUCED BY THE +// MSGP CODE GENERATION TOOL (github.com/tinylib/msgp) +// DO NOT EDIT + +import ( + "github.com/tinylib/msgp/msgp" +) + +// DecodeMsg implements msgp.Decodable +func (z *Trace) DecodeMsg(dc *msgp.Reader) (err error) { + var xsz uint32 + xsz, err = dc.ReadArrayHeader() + if err != nil { + return + } + if cap((*z)) >= int(xsz) { + (*z) = (*z)[:xsz] + } else { + (*z) = make(Trace, xsz) + } + for bzg := range *z { + if dc.IsNil() { + err = dc.ReadNil() + if err != nil { + return 
+ } + (*z)[bzg] = nil + } else { + if (*z)[bzg] == nil { + (*z)[bzg] = new(Span) + } + err = (*z)[bzg].DecodeMsg(dc) + if err != nil { + return + } + } + } + return +} + +// EncodeMsg implements msgp.Encodable +func (z Trace) EncodeMsg(en *msgp.Writer) (err error) { + err = en.WriteArrayHeader(uint32(len(z))) + if err != nil { + return + } + for bai := range z { + if z[bai] == nil { + err = en.WriteNil() + if err != nil { + return + } + } else { + err = z[bai].EncodeMsg(en) + if err != nil { + return + } + } + } + return +} + +func (z Trace) Msgsize() (s int) { + s = msgp.ArrayHeaderSize + for bai := range z { + if z[bai] == nil { + s += msgp.NilSize + } else { + s += z[bai].Msgsize() + } + } + return +} + +// DecodeMsg implements msgp.Decodable +func (z *Traces) DecodeMsg(dc *msgp.Reader) (err error) { + var xsz uint32 + xsz, err = dc.ReadArrayHeader() + if err != nil { + return + } + if cap((*z)) >= int(xsz) { + (*z) = (*z)[:xsz] + } else { + (*z) = make(Traces, xsz) + } + for wht := range *z { + var xsz uint32 + xsz, err = dc.ReadArrayHeader() + if err != nil { + return + } + if cap((*z)[wht]) >= int(xsz) { + (*z)[wht] = (*z)[wht][:xsz] + } else { + (*z)[wht] = make(Trace, xsz) + } + for hct := range (*z)[wht] { + if dc.IsNil() { + err = dc.ReadNil() + if err != nil { + return + } + (*z)[wht][hct] = nil + } else { + if (*z)[wht][hct] == nil { + (*z)[wht][hct] = new(Span) + } + err = (*z)[wht][hct].DecodeMsg(dc) + if err != nil { + return + } + } + } + } + return +} + +// EncodeMsg implements msgp.Encodable +func (z Traces) EncodeMsg(en *msgp.Writer) (err error) { + err = en.WriteArrayHeader(uint32(len(z))) + if err != nil { + return + } + for cua := range z { + err = en.WriteArrayHeader(uint32(len(z[cua]))) + if err != nil { + return + } + for xhx := range z[cua] { + if z[cua][xhx] == nil { + err = en.WriteNil() + if err != nil { + return + } + } else { + err = z[cua][xhx].EncodeMsg(en) + if err != nil { + return + } + } + } + } + return +} + +func (z Traces) Msgsize() (s int) { + s = msgp.ArrayHeaderSize + for cua := range z { + s += msgp.ArrayHeaderSize + for xhx := range z[cua] { + if z[cua][xhx] == nil { + s += msgp.NilSize + } else { + s += z[cua][xhx].Msgsize() + } + } + } + return +} diff --git a/pkg/trace/pb/trace_gen_test.go b/pkg/trace/pb/trace_gen_test.go new file mode 100644 index 0000000000000..8da88c09b270d --- /dev/null +++ b/pkg/trace/pb/trace_gen_test.go @@ -0,0 +1,122 @@ +package pb + +// NOTE: THIS FILE WAS PRODUCED BY THE +// MSGP CODE GENERATION TOOL (github.com/tinylib/msgp) +// DO NOT EDIT + +import ( + "bytes" + "testing" + + "github.com/tinylib/msgp/msgp" +) + +func TestEncodeDecodeTrace(t *testing.T) { + v := Trace{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + + m := v.Msgsize() + if buf.Len() > m { + t.Logf("WARNING: Msgsize() for %v is inaccurate", v) + } + + vn := Trace{} + err := msgp.Decode(&buf, &vn) + if err != nil { + t.Error(err) + } + + buf.Reset() + msgp.Encode(&buf, &v) + err = msgp.NewReader(&buf).Skip() + if err != nil { + t.Error(err) + } +} + +func BenchmarkEncodeTrace(b *testing.B) { + v := Trace{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + b.SetBytes(int64(buf.Len())) + en := msgp.NewWriter(msgp.Nowhere) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + v.EncodeMsg(en) + } + en.Flush() +} + +func BenchmarkDecodeTrace(b *testing.B) { + v := Trace{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + b.SetBytes(int64(buf.Len())) + rd := msgp.NewEndlessReader(buf.Bytes(), b) + dc := msgp.NewReader(rd) + b.ReportAllocs() + 
b.ResetTimer() + for i := 0; i < b.N; i++ { + err := v.DecodeMsg(dc) + if err != nil { + b.Fatal(err) + } + } +} + +func TestEncodeDecodeTraces(t *testing.T) { + v := Traces{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + + m := v.Msgsize() + if buf.Len() > m { + t.Logf("WARNING: Msgsize() for %v is inaccurate", v) + } + + vn := Traces{} + err := msgp.Decode(&buf, &vn) + if err != nil { + t.Error(err) + } + + buf.Reset() + msgp.Encode(&buf, &v) + err = msgp.NewReader(&buf).Skip() + if err != nil { + t.Error(err) + } +} + +func BenchmarkEncodeTraces(b *testing.B) { + v := Traces{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + b.SetBytes(int64(buf.Len())) + en := msgp.NewWriter(msgp.Nowhere) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + v.EncodeMsg(en) + } + en.Flush() +} + +func BenchmarkDecodeTraces(b *testing.B) { + v := Traces{} + var buf bytes.Buffer + msgp.Encode(&buf, &v) + b.SetBytes(int64(buf.Len())) + rd := msgp.NewEndlessReader(buf.Bytes(), b) + dc := msgp.NewReader(rd) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := v.DecodeMsg(dc) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/pkg/trace/pb/trace_payload.pb.go b/pkg/trace/pb/trace_payload.pb.go new file mode 100644 index 0000000000000..b8f17c37e446a --- /dev/null +++ b/pkg/trace/pb/trace_payload.pb.go @@ -0,0 +1,458 @@ +// Code generated by protoc-gen-gogo. +// source: trace_payload.proto +// DO NOT EDIT! + +package pb + +import proto "github.com/gogo/protobuf/proto" +import fmt "fmt" +import math "math" + +import io "io" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +type TracePayload struct { + HostName string `protobuf:"bytes,1,opt,name=hostName,proto3" json:"hostName,omitempty"` + Env string `protobuf:"bytes,2,opt,name=env,proto3" json:"env,omitempty"` + Traces []*APITrace `protobuf:"bytes,3,rep,name=traces" json:"traces,omitempty"` + Transactions []*Span `protobuf:"bytes,4,rep,name=transactions" json:"transactions,omitempty"` +} + +func (m *TracePayload) Reset() { *m = TracePayload{} } +func (m *TracePayload) String() string { return proto.CompactTextString(m) } +func (*TracePayload) ProtoMessage() {} +func (*TracePayload) Descriptor() ([]byte, []int) { return fileDescriptorTracePayload, []int{0} } + +func (m *TracePayload) GetTraces() []*APITrace { + if m != nil { + return m.Traces + } + return nil +} + +func (m *TracePayload) GetTransactions() []*Span { + if m != nil { + return m.Transactions + } + return nil +} + +func init() { + proto.RegisterType((*TracePayload)(nil), "model.TracePayload") +} +func (m *TracePayload) Marshal() (data []byte, err error) { + size := m.Size() + data = make([]byte, size) + n, err := m.MarshalTo(data) + if err != nil { + return nil, err + } + return data[:n], nil +} + +func (m *TracePayload) MarshalTo(data []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.HostName) > 0 { + data[i] = 0xa + i++ + i = encodeVarintTracePayload(data, i, uint64(len(m.HostName))) + i += copy(data[i:], m.HostName) + } + if len(m.Env) > 0 { + data[i] = 0x12 + i++ + i = encodeVarintTracePayload(data, i, uint64(len(m.Env))) + i += copy(data[i:], m.Env) + } + if len(m.Traces) > 0 { + for _, msg := range m.Traces { + data[i] = 0x1a + i++ + i = encodeVarintTracePayload(data, i, uint64(msg.Size())) + n, err := msg.MarshalTo(data[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if len(m.Transactions) > 0 { + for _, msg := range 
m.Transactions { + data[i] = 0x22 + i++ + i = encodeVarintTracePayload(data, i, uint64(msg.Size())) + n, err := msg.MarshalTo(data[i:]) + if err != nil { + return 0, err + } + i += n + } + } + return i, nil +} + +func encodeFixed64TracePayload(data []byte, offset int, v uint64) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + data[offset+4] = uint8(v >> 32) + data[offset+5] = uint8(v >> 40) + data[offset+6] = uint8(v >> 48) + data[offset+7] = uint8(v >> 56) + return offset + 8 +} +func encodeFixed32TracePayload(data []byte, offset int, v uint32) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + return offset + 4 +} +func encodeVarintTracePayload(data []byte, offset int, v uint64) int { + for v >= 1<<7 { + data[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + data[offset] = uint8(v) + return offset + 1 +} +func (m *TracePayload) Size() (n int) { + var l int + _ = l + l = len(m.HostName) + if l > 0 { + n += 1 + l + sovTracePayload(uint64(l)) + } + l = len(m.Env) + if l > 0 { + n += 1 + l + sovTracePayload(uint64(l)) + } + if len(m.Traces) > 0 { + for _, e := range m.Traces { + l = e.Size() + n += 1 + l + sovTracePayload(uint64(l)) + } + } + if len(m.Transactions) > 0 { + for _, e := range m.Transactions { + l = e.Size() + n += 1 + l + sovTracePayload(uint64(l)) + } + } + return n +} + +func sovTracePayload(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} +func sozTracePayload(x uint64) (n int) { + return sovTracePayload(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *TracePayload) Unmarshal(data []byte) error { + l := len(data) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTracePayload + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: TracePayload: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: TracePayload: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field HostName", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTracePayload + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthTracePayload + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.HostName = string(data[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Env", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTracePayload + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthTracePayload + } + postIndex := iNdEx + 
intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Env = string(data[iNdEx:postIndex]) + iNdEx = postIndex + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Traces", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTracePayload + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthTracePayload + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Traces = append(m.Traces, &APITrace{}) + if err := m.Traces[len(m.Traces)-1].Unmarshal(data[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Transactions", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTracePayload + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthTracePayload + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Transactions = append(m.Transactions, &Span{}) + if err := m.Transactions[len(m.Transactions)-1].Unmarshal(data[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipTracePayload(data[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthTracePayload + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipTracePayload(data []byte) (n int, err error) { + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTracePayload + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTracePayload + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if data[iNdEx-1] < 0x80 { + break + } + } + return iNdEx, nil + case 1: + iNdEx += 8 + return iNdEx, nil + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTracePayload + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + iNdEx += length + if length < 0 { + return 0, ErrInvalidLengthTracePayload + } + return iNdEx, nil + case 3: + for { + var innerWire uint64 + var start int = iNdEx + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowTracePayload + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + innerWire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + innerWireType := int(innerWire & 0x7) + if innerWireType == 4 { + break + } + next, err := skipTracePayload(data[start:]) + if err != nil { + return 0, err + } + iNdEx = start + next + } + return iNdEx, nil + case 4: + return iNdEx, nil + case 5: + iNdEx += 4 + return iNdEx, nil + 
default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + } + panic("unreachable") +} + +var ( + ErrInvalidLengthTracePayload = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowTracePayload = fmt.Errorf("proto: integer overflow") +) + +func init() { proto.RegisterFile("trace_payload.proto", fileDescriptorTracePayload) } + +var fileDescriptorTracePayload = []byte{ + // 192 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0x2e, 0x29, 0x4a, 0x4c, + 0x4e, 0x8d, 0x2f, 0x48, 0xac, 0xcc, 0xc9, 0x4f, 0x4c, 0xd1, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, + 0x62, 0xcd, 0xcd, 0x4f, 0x49, 0xcd, 0x91, 0xe2, 0x06, 0xcb, 0x41, 0xc4, 0xa4, 0xb8, 0x8a, 0x0b, + 0x12, 0xf3, 0x20, 0x6c, 0xa5, 0x69, 0x8c, 0x5c, 0x3c, 0x21, 0x20, 0xb9, 0x00, 0x88, 0x36, 0x21, + 0x29, 0x2e, 0x8e, 0x8c, 0xfc, 0xe2, 0x12, 0xbf, 0xc4, 0xdc, 0x54, 0x09, 0x46, 0x05, 0x46, 0x0d, + 0xce, 0x20, 0x38, 0x5f, 0x48, 0x80, 0x8b, 0x39, 0x35, 0xaf, 0x4c, 0x82, 0x09, 0x2c, 0x0c, 0x62, + 0x0a, 0xa9, 0x73, 0xb1, 0x81, 0x4d, 0x2e, 0x96, 0x60, 0x56, 0x60, 0xd6, 0xe0, 0x36, 0xe2, 0xd7, + 0x03, 0xdb, 0xa7, 0xe7, 0x18, 0xe0, 0x09, 0x36, 0x35, 0x08, 0x2a, 0x2d, 0xa4, 0xcf, 0xc5, 0x53, + 0x52, 0x94, 0x98, 0x57, 0x9c, 0x98, 0x5c, 0x92, 0x99, 0x9f, 0x57, 0x2c, 0xc1, 0x02, 0x56, 0xce, + 0x0d, 0x55, 0x1e, 0x5c, 0x90, 0x98, 0x17, 0x84, 0xa2, 0xc0, 0x49, 0xe0, 0xc4, 0x23, 0x39, 0xc6, + 0x0b, 0x8f, 0xe4, 0x18, 0x1f, 0x3c, 0x92, 0x63, 0x9c, 0xf0, 0x58, 0x8e, 0x21, 0x89, 0x0d, 0xec, + 0x62, 0x63, 0x40, 0x00, 0x00, 0x00, 0xff, 0xff, 0x5f, 0xc9, 0x04, 0x9e, 0xe8, 0x00, 0x00, 0x00, +} diff --git a/pkg/trace/pb/trace_payload.proto b/pkg/trace/pb/trace_payload.proto new file mode 100644 index 0000000000000..3c7f334d98bd1 --- /dev/null +++ b/pkg/trace/pb/trace_payload.proto @@ -0,0 +1,13 @@ +syntax = "proto3"; + +package pb; + +import "trace.proto"; +import "span.proto"; + +message TracePayload { + string hostName = 1; + string env = 2; + repeated APITrace traces = 3; + repeated Span transactions = 4; +} diff --git a/pkg/trace/quantile/README.md b/pkg/trace/quantile/README.md new file mode 100644 index 0000000000000..fa87f8d65a482 --- /dev/null +++ b/pkg/trace/quantile/README.md @@ -0,0 +1,14 @@ +Sketches +-------- + +Papers: + +- [GK Quantiles](http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf) +- [An Experimental Study of Distributed Quantile Estimation](http://arxiv.org/pdf/1508.05710.pdf) +- [Mergeable Summaries](https://www.cs.utah.edu/~jeffp/papers/merge-summ.pdf) +- [Almost Optimal Streaming Quantiles Algorithms](http://arxiv.org/abs/1603.05346) +- [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf) + +Blogs: + +- [Streaming Approximate Histograms in Go](https://www.vividcortex.com/blog/2013/07/08/streaming-approximate-histograms/) diff --git a/pkg/trace/quantile/summary.go b/pkg/trace/quantile/summary.go new file mode 100644 index 0000000000000..b5d21de362fc9 --- /dev/null +++ b/pkg/trace/quantile/summary.go @@ -0,0 +1,239 @@ +// Package quantile implements "Space-Efficient Online Computation of Quantile +// Summaries" (Greenwald, Khanna 2001): +// http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf +// +// This implementation is backed by a skiplist to make inserting elements into the +// summary faster. Querying is still O(n). 
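+//
+// A minimal usage sketch (illustrative only; it uses the NewSliceSummary, Insert and
+// Quantile API defined in this file, where Insert pairs a value with the ID of the
+// span it was reported from and Quantile answers within EPSILON*N ranks):
+//
+//	s := NewSliceSummary()
+//	for i := 0; i < 10000; i++ {
+//		s.Insert(float64(i), uint64(i))
+//	}
+//	p99 := s.Quantile(0.99) // ~9899 here, give or take EPSILON*N ranks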
+package quantile + +import ( + "bytes" + "fmt" + "sort" +) + +// EPSILON is the precision of the rank returned by our quantile queries +const EPSILON float64 = 0.01 + +// SliceSummary is a GK-summary with a slice backend +type SliceSummary struct { + Entries []Entry + N int +} + +// Entry is an element of the skiplist, see GK paper for description +type Entry struct { + V float64 `json:"v"` + G int `json:"g"` + Delta int `json:"delta"` +} + +// NewSliceSummary allocates a new GK summary backed by a DLL +func NewSliceSummary() *SliceSummary { + return &SliceSummary{} +} + +func (s SliceSummary) String() string { + var b bytes.Buffer + b.WriteString("summary size: ") + b.WriteString(fmt.Sprintf("%d", s.N)) + b.WriteRune('\n') + + gsum := 0 + + for i, e := range s.Entries { + gsum += e.G + b.WriteString(fmt.Sprintf("v:%6.02f g:%05d d:%05d rmin:%05d rmax: %05d ", e.V, e.G, e.Delta, gsum, gsum+e.Delta)) + if i%3 == 2 { + b.WriteRune('\n') + } + } + + return b.String() +} + +// Insert inserts a new value v in the summary paired with t (the ID of the span it was reported from) +func (s *SliceSummary) Insert(v float64, t uint64) { + newEntry := Entry{ + V: v, + G: 1, + Delta: int(2 * EPSILON * float64(s.N)), + } + + i := sort.Search(len(s.Entries), func(i int) bool { return v < s.Entries[i].V }) + + if i == 0 || i == len(s.Entries) { + newEntry.Delta = 0 + } + + // allocate one more + s.Entries = append(s.Entries, Entry{}) + copy(s.Entries[i+1:], s.Entries[i:]) + s.Entries[i] = newEntry + s.N++ + + if s.N%int(1.0/float64(2.0*EPSILON)) == 0 { + s.compress() + } +} + +func (s *SliceSummary) compress() { + epsN := int(2 * EPSILON * float64(s.N)) + + var j, sum int + for i := len(s.Entries) - 1; i >= 2; i = j - 1 { + j = i - 1 + sum = s.Entries[j].G + + for j >= 1 && sum+s.Entries[i].G+s.Entries[i].Delta < epsN { + j-- + sum += s.Entries[j].G + } + sum -= s.Entries[j].G + j++ + + if j < i { + s.Entries[j].V = s.Entries[i].V + s.Entries[j].G = sum + s.Entries[i].G + s.Entries[j].Delta = s.Entries[i].Delta + // copy the rest + copy(s.Entries[j+1:], s.Entries[i+1:]) + // truncate to the numbers of removed elements + s.Entries = s.Entries[:len(s.Entries)-(i-j)] + } + } +} + +// Quantile returns an EPSILON estimate of the element at quantile 'q' (0 <= q <= 1) +func (s *SliceSummary) Quantile(q float64) float64 { + if len(s.Entries) == 0 { + return 0 + } + + // convert quantile to rank + r := int(q*float64(s.N) + 0.5) + + var rmin int + epsN := int(EPSILON * float64(s.N)) + + for i := 0; i < len(s.Entries)-1; i++ { + t := s.Entries[i] + n := s.Entries[i+1] + + rmin += t.G + + if r+epsN < rmin+n.G+n.Delta { + if r+epsN < rmin+n.G { + return t.V + } + return n.V + } + } + + return s.Entries[len(s.Entries)-1].V +} + +// Merge two summaries entries together +func (s *SliceSummary) Merge(s2 *SliceSummary) { + if s2.N == 0 { + return + } + if s.N == 0 { + s.N = s2.N + s.Entries = make([]Entry, 0, len(s2.Entries)) + s.Entries = append(s.Entries, s2.Entries...) + return + } + + pos := 0 + end := len(s.Entries) - 1 + + empties := make([]Entry, len(s2.Entries)) + s.Entries = append(s.Entries, empties...) 
+ + for _, e := range s2.Entries { + for pos <= end { + if e.V > s.Entries[pos].V { + pos++ + continue + } + copy(s.Entries[pos+1:end+2], s.Entries[pos:end+1]) + s.Entries[pos] = e + pos++ + end++ + break + } + if pos > end { + s.Entries[pos] = e + pos++ + } + } + s.N += s2.N + + s.compress() +} + +// Copy allocates a new summary with the same data +func (s *SliceSummary) Copy() *SliceSummary { + s2 := NewSliceSummary() + s2.Entries = make([]Entry, len(s.Entries)) + copy(s2.Entries, s.Entries) + s2.N = s.N + return s2 +} + +// SummarySlice reprensents how many values are in a [Start, End] range +type SummarySlice struct { + Start float64 + End float64 + Weight int +} + +// BySlices returns a slice of Summary slices that represents weighted ranges of +// values +// e.g. [0, 1] : 3 +// [1, 23] : 12 ... +// The number of intervals is related to the precision kept in the internal +// data structure to ensure epsilon*s.N precision on quantiles, but it's bounded. +// When the bounds of the interval are equal, the weight is the number of times +// that exact value was inserted in the summary. +func (s *SliceSummary) BySlices() []SummarySlice { + var slices []SummarySlice + + if len(s.Entries) == 0 { + return slices + } + + // by def in GK first val is always the min + fs := SummarySlice{ + Start: s.Entries[0].V, + End: s.Entries[0].V, + Weight: 1, + } + slices = append(slices, fs) + + last := fs.End + + for _, cur := range s.Entries[1:] { + lastSlice := &slices[len(slices)-1] + if cur.V == lastSlice.Start && cur.V == lastSlice.End { + lastSlice.Weight += cur.G + continue + } + + if cur.G == 1 { + last = cur.V + } + + ss := SummarySlice{ + Start: last, + End: cur.V, + Weight: cur.G, + } + slices = append(slices, ss) + + last = cur.V + } + + return slices +} diff --git a/pkg/trace/quantile/summary_bench_test.go b/pkg/trace/quantile/summary_bench_test.go new file mode 100644 index 0000000000000..ab9e2e1f88ed8 --- /dev/null +++ b/pkg/trace/quantile/summary_bench_test.go @@ -0,0 +1,104 @@ +package quantile + +import ( + "encoding/json" + "math/rand" + "testing" +) + +const randlen = 1000 + +func randSlice(n int) []float64 { + // use those + vals := make([]float64, 0, randlen) + for i := 0; i < n; i++ { + vals = append(vals, rand.Float64()*100000) + } + + return vals +} + +func BenchmarkGKSliceInsertion(b *testing.B) { + s := NewSliceSummary() + + vals := randSlice(randlen) + + b.ResetTimer() + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + s.Insert(vals[n%randlen], uint64(n)) + } +} + +func BenchmarkGKSliceInsertionPreallocd(b *testing.B) { + s := NewSliceSummary() + s.Entries = make([]Entry, 0, 100) + + vals := randSlice(randlen) + + b.ResetTimer() + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + s.Insert(vals[n%randlen], uint64(n)) + } +} + +func BGKSliceQuantiles(b *testing.B, n int) { + s := NewSliceSummary() + vals := randSlice(n) + for i := 0; i < n; i++ { + s.Insert(vals[i], uint64(i)) + } + + b.ResetTimer() + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + s.Quantile(rand.Float64()) + } +} +func BenchmarkGKSliceQuantiles10(b *testing.B) { + BGKSliceQuantiles(b, 10) +} +func BenchmarkGKSliceQuantiles100(b *testing.B) { + BGKSliceQuantiles(b, 100) +} +func BenchmarkGKSliceQuantiles1000(b *testing.B) { + BGKSliceQuantiles(b, 1000) +} +func BenchmarkGKSliceQuantiles10000(b *testing.B) { + BGKSliceQuantiles(b, 10000) +} +func BenchmarkGKSliceQuantiles100000(b *testing.B) { + BGKSliceQuantiles(b, 100000) +} + +func BGKSliceEncoding(b *testing.B, n int) { + s := NewSliceSummary() + vals := 
randSlice(n) + for i := 0; i < n; i++ { + s.Insert(vals[i], uint64(i)) + } + + b.ResetTimer() + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + blob, _ := json.Marshal(&s) + var ss SliceSummary + json.Unmarshal(blob, &ss) + } +} +func BenchmarkGKSliceEncoding10(b *testing.B) { + BGKSliceEncoding(b, 10) +} +func BenchmarkGKSliceEncoding100(b *testing.B) { + BGKSliceEncoding(b, 100) +} + +// not worth encoding larger as we're constant in mem +func BenchmarkGKSliceEncoding1000(b *testing.B) { + BGKSliceEncoding(b, 1000) +} diff --git a/pkg/trace/quantile/summary_test.go b/pkg/trace/quantile/summary_test.go new file mode 100644 index 0000000000000..aae34c440b7b8 --- /dev/null +++ b/pkg/trace/quantile/summary_test.go @@ -0,0 +1,195 @@ +package quantile + +import ( + "fmt" + "math" + "testing" + + "github.com/stretchr/testify/assert" +) + +/************************************************************************************ + DATA VALIDATION, with different strategies make sure of the correctness of + our epsilon-approximate quantiles +************************************************************************************/ + +var testQuantiles = []float64{0, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99, 0.999, 0.9999, 1} + +func GenSummarySlice(n int, gen func(i int) float64) []float64 { + s := NewSliceSummary() + + for i := 0; i < n; i++ { + s.Insert(gen(i), uint64(i)) + } + + vals := make([]float64, 0, len(testQuantiles)) + for _, q := range testQuantiles { + val := s.Quantile(q) + vals = append(vals, val) + } + + return vals +} + +/* CONSTANT STREAMS + The most simple checker +*/ +func ConstantGenerator(i int) float64 { + return 42 +} +func SummarySliceConstantN(t *testing.T, n int) { + assert := assert.New(t) + vals := GenSummarySlice(n, ConstantGenerator) + for _, v := range vals { + assert.Equal(42.0, v) + } +} +func TestSummarySliceConstant10(t *testing.T) { + SummarySliceConstantN(t, 10) +} +func TestSummarySliceConstant100(t *testing.T) { + SummarySliceConstantN(t, 100) +} +func TestSummarySliceConstant1000(t *testing.T) { + SummarySliceConstantN(t, 1000) +} +func TestSummarySliceConstant10000(t *testing.T) { + SummarySliceConstantN(t, 10000) +} +func TestSummarySliceConstant100000(t *testing.T) { + SummarySliceConstantN(t, 100000) +} + +/* uniform distribution + expected quantiles are easily to compute as the value == its rank + 1 to i +*/ +func UniformGenerator(i int) float64 { + return float64(i) +} +func SummarySliceUniformN(t *testing.T, n int) { + assert := assert.New(t) + vals := GenSummarySlice(n, UniformGenerator) + + for i, v := range vals { + var exp float64 + if testQuantiles[i] == 0 { + exp = 0 + } else if testQuantiles[i] == 1 { + exp = float64(n) - 1 + } else { + rank := math.Ceil(testQuantiles[i] * float64(n)) + exp = rank - 1 + } + assert.InDelta(exp, v, EPSILON*float64(n), "quantile %f failed, exp: %f, val: %f", testQuantiles[i], exp, v) + } +} +func TestSummarySliceUniform10(t *testing.T) { + SummarySliceUniformN(t, 10) +} +func TestSummarySliceUniform100(t *testing.T) { + SummarySliceUniformN(t, 100) +} +func TestSummarySliceUniform1000(t *testing.T) { + SummarySliceUniformN(t, 1000) +} +func TestSummarySliceUniform10000(t *testing.T) { + SummarySliceUniformN(t, 10000) +} +func TestSummarySliceUniform100000(t *testing.T) { + SummarySliceUniformN(t, 100000) +} + +func TestSummarySliceMerge(t *testing.T) { + assert := assert.New(t) + s1 := NewSliceSummary() + for i := 0; i < 101; i++ { + s1.Insert(float64(i), uint64(i)) + } + + s2 := NewSliceSummary() + for i := 0; i < 50; 
i++ { + s2.Insert(float64(i), uint64(i)) + } + + s1.Merge(s2) + + expected := map[float64]float64{ + 0.0: 0, + 0.2: 15, + 0.4: 30, + 0.6: 45, + 0.8: 70, + 1.0: 100, + } + + for q, e := range expected { + v := s1.Quantile(q) + assert.Equal(e, v) + } +} + +func TestSliceSummaryRemergeReal10000(t *testing.T) { + s := NewSliceSummary() + for n := 0; n < 10000; n++ { + s1 := NewSliceSummary() + for i := 0; i < 100; i++ { + s1.Insert(float64(i), uint64(i)) + } + s.Merge(s1) + + } + + fmt.Println(s) + slices := s.BySlices() + fmt.Println(slices) + total := 0 + for _, s := range slices { + total += s.Weight + } + fmt.Println(total) +} + +func TestSliceSummaryRemerge10000(t *testing.T) { + s1 := NewSliceSummary() + for n := 0; n < 1000; n++ { + for i := 0; i < 100; i++ { + s1.Insert(float64(i), uint64(i)) + } + + // fmt.Println(s1) + } + + fmt.Println(s1) + slices := s1.BySlices() + fmt.Println(slices) + total := 0 + for _, s := range slices { + total += s.Weight + } + fmt.Println(total) +} + +func TestSummaryBySlices(t *testing.T) { + assert := assert.New(t) + + s := NewSliceSummary() + for i := 1; i < 11; i++ { + s.Insert(float64(i), uint64(i)) + } + s.Insert(float64(5), uint64(42)) + s.Insert(float64(5), uint64(53)) + + slices := s.BySlices() + fmt.Println(slices) + assert.Equal(10, len(slices)) + for i, sl := range slices { + assert.Equal(float64(i+1), sl.Start) + assert.Equal(float64(i+1), sl.End) + if i == 4 { + assert.Equal(3, sl.Weight) + } else { + assert.Equal(1, sl.Weight) + } + } +} diff --git a/pkg/trace/quantile/weighted.go b/pkg/trace/quantile/weighted.go new file mode 100644 index 0000000000000..7797da741c512 --- /dev/null +++ b/pkg/trace/quantile/weighted.go @@ -0,0 +1,84 @@ +package quantile + +import ( + "math" + "math/rand" +) + +var randomFloats []float64 + +func init() { + // generate a list of guaranteed random numbers for the probabilistic round + randomFloats = make([]float64, 100) + r := rand.New(rand.NewSource(7337)) + for i := 0; i < 100; i++ { + randomFloats[i] = r.Float64() + } +} + +// WeightedSliceSummary associates a weight to a slice summary. +type WeightedSliceSummary struct { + Weight float64 + *SliceSummary +} + +func probabilisticRound(g int, weight float64, randFloat func() float64) int { + raw := weight * float64(g) + decimal := raw - math.Floor(raw) + limit := randFloat() + + iraw := int(raw) + if limit > decimal { + iraw++ + } + + return iraw +} + +// WeighSummary applies a weight factor to a slice summary and return it as a +// new slice. 
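+// A rough usage sketch (illustrative; BySlicesWeighted below is the usual entry point):
+//
+//	half := WeighSummary(s, 0.5)                           // ~half of s's counts, kept probabilistically
+//	hist := BySlicesWeighted(WeightedSliceSummary{0.5, s}) // same weighting, returned as ranges
+//
+// Entries whose scaled count rounds to zero are dropped, and the randomness comes from
+// the fixed randomFloats table above, so the weighting is reproducible.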
+func WeighSummary(s *SliceSummary, weight float64) *SliceSummary { + // Deterministic random number generation based on a list because rand.Seed + // is expensive to run + i := 0 + randFloat := func() float64 { + i++ + return randomFloats[i%len(randomFloats)] + } + + sw := NewSliceSummary() + sw.Entries = make([]Entry, 0, len(s.Entries)) + + gsum := 0 + for _, e := range s.Entries { + newg := probabilisticRound(e.G, weight, randFloat) + // if an entry is down to 0 delete it + if newg != 0 { + sw.Entries = append(sw.Entries, + Entry{V: e.V, G: newg, Delta: e.Delta}, + ) + gsum += newg + } + } + + sw.N = gsum + return sw +} + +// BySlicesWeighted BySlices() is the BySlices version but combines multiple +// weighted slice summaries before returning the histogram +func BySlicesWeighted(summaries ...WeightedSliceSummary) []SummarySlice { + if len(summaries) == 0 { + return []SummarySlice{} + } + + mergeSummary := WeighSummary(summaries[0].SliceSummary, summaries[0].Weight) + if len(summaries) > 1 { + for _, s := range summaries[1:] { + sw := WeighSummary(s.SliceSummary, s.Weight) + mergeSummary.Merge(sw) + } + } + + return mergeSummary.BySlices() +} diff --git a/pkg/trace/quantile/weighted_test.go b/pkg/trace/quantile/weighted_test.go new file mode 100644 index 0000000000000..1ccabf28befd7 --- /dev/null +++ b/pkg/trace/quantile/weighted_test.go @@ -0,0 +1,93 @@ +package quantile + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBySlicesWeightedHalf(t *testing.T) { + s := NewSliceSummary() + for i := 0; i < 100000; i++ { + s.Insert(float64(i%10000), 0) + } + + s2 := NewSliceSummary() + for i := 0; i < 100000; i++ { + s2.Insert(float64(i%10000), 0) + } + + sw1 := WeightedSliceSummary{1.0, s} + sw2 := WeightedSliceSummary{0.5, s2} + + ss := BySlicesWeighted(sw1, sw2) + + // deviation = (num of sum merged = 2) deviation * GK-dev (eps * N) + deviation := 2 * EPSILON * (100000 + 50000) + total := 0 + for _, sl := range ss { + total += sl.Weight + // corner case - tolerate + if sl.Weight == 1 { + continue + } + + expected := int(float64(sl.End-sl.Start) * 15) + require.InDelta(t, expected, sl.Weight, deviation, + "slice [%.0f;%.0f] = %d failed assertion for slices %v", + sl.Start, sl.End, sl.Weight, ss, + ) + + } + require.InDelta(t, 150000, total, deviation, "summaries totals do not match %v", ss) +} + +func TestBySlicesWeightedSingle(t *testing.T) { + s := NewSliceSummary() + for i := 0; i < 1000000; i++ { + s.Insert(float64(i), 0) + } + + sw := WeightedSliceSummary{0.1, s} + ss := BySlicesWeighted(sw) + + // deviation = deviation * GK-dev (eps * N) + deviation := EPSILON * 1000000 + + total := 0 + for _, sl := range ss { + total += sl.Weight + // if the entry is alone this is ok + if sl.Weight == 1 { + continue + } + + expected := int(float64(sl.End-sl.Start) * 0.1) + require.InDelta(t, expected, sl.Weight, deviation, + "slice [%.0f;%.0f] = %d failed assertion for slices %v", + sl.Start, sl.End, sl.Weight, ss, + ) + + } + require.InDelta(t, 100000, total, deviation, "summaries totals do not match %v", ss) +} + +func TestBySlicesWeightedSmall(t *testing.T) { + s := NewSliceSummary() + for i := 0; i < 10; i++ { + s.Insert(float64(i), 0) + } + + sw := WeightedSliceSummary{0.5, s} + ss := BySlicesWeighted(sw) + + // should have ~5 elements probabilistically chosen + fmt.Println(ss) +} + +func TestBySlicesWeightedEmpty(t *testing.T) { + ss := BySlicesWeighted() + assert.Equal(t, 0, len(ss)) +} diff --git 
a/pkg/trace/sampler/adjust.go b/pkg/trace/sampler/adjust.go new file mode 100644 index 0000000000000..4692e0a342c34 --- /dev/null +++ b/pkg/trace/sampler/adjust.go @@ -0,0 +1,67 @@ +package sampler + +import ( + "math" +) + +// AdjustScoring modifies sampler coefficients to fit better the `maxTPS` condition +func (s *Sampler) AdjustScoring() { + currentTPS := s.Backend.GetSampledScore() + totalTPS := s.Backend.GetTotalScore() + offset := s.signatureScoreOffset.Load() + cardinality := float64(s.Backend.GetCardinality()) + + newOffset, newSlope := adjustCoefficients(currentTPS, totalTPS, s.maxTPS, offset, cardinality) + + s.SetSignatureCoefficients(newOffset, newSlope) +} + +func adjustCoefficients(currentTPS, totalTPS, maxTPS, offset, cardinality float64) (newOffset, newSlope float64) { + // See how far we are from our maxTPS limit and make signature sampler harder/softer accordingly + TPSratio := currentTPS / maxTPS + + // Compute how much we should change the offset + coefficient := 1.0 + + if TPSratio > 1 { + // If above, reduce the offset + coefficient = 0.8 + // If we keep 3x too many traces, reduce the offset even more + if TPSratio > 3 { + coefficient = 0.5 + } + } else if TPSratio < 0.8 { + // If below, increase the offset + // Don't do it if: + // - we already keep all traces (with a 1% margin because of stats imprecision) + // - offset above maxTPS + if currentTPS < 0.99*totalTPS && offset < maxTPS { + coefficient = 1.1 + if TPSratio < 0.5 { + coefficient = 1.3 + } + } + } + + newOffset = coefficient * offset + + // Safeguard to avoid too small offset (for guaranteed very-low volume sampling) + if newOffset < minSignatureScoreOffset { + newOffset = minSignatureScoreOffset + } + + // Default slope value + newSlope = defaultSignatureScoreSlope + + // Compute the slope based on the signature count distribution + // TODO: explain this formula + if offset < totalTPS { + newSlope = math.Pow(10, math.Log10(cardinality*totalTPS/maxTPS)/math.Log10(totalTPS/minSignatureScoreOffset)) + // That's the max value we should allow. When slope == 10, we basically keep only `offset` traces per signature + if newSlope > 10 { + newSlope = 10 + } + } + + return newOffset, newSlope +} diff --git a/pkg/trace/sampler/adjust_test.go b/pkg/trace/sampler/adjust_test.go new file mode 100644 index 0000000000000..c076f7c342d4f --- /dev/null +++ b/pkg/trace/sampler/adjust_test.go @@ -0,0 +1,32 @@ +package sampler + +import ( + // "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestAdjustCoefficients(t *testing.T) { + assert := assert.New(t) + + for _, a := range [][5]float64{ + // currentTPS, totalTPS, maxTPS, offset, cardinality + [5]float64{10, 50, 15, 0.5, 200}, + } { + currentTPS, totalTPS, maxTPS, offset, cardinality := a[0], a[1], a[2], a[3], a[4] + newOffset, newSlope := adjustCoefficients(currentTPS, totalTPS, maxTPS, offset, cardinality) + + // Whatever the input is, we must always have respect basic bounds + assert.True(newOffset >= minSignatureScoreOffset) + assert.True(newSlope >= 1) + assert.True(newSlope <= 10) + + // Check that we are adjusting in the "good" direction + if currentTPS >= maxTPS { + assert.True(newOffset <= offset) + } else { + assert.True(newOffset >= offset) + } + } +} diff --git a/pkg/trace/sampler/backend.go b/pkg/trace/sampler/backend.go new file mode 100644 index 0000000000000..c479a1886a519 --- /dev/null +++ b/pkg/trace/sampler/backend.go @@ -0,0 +1,34 @@ +package sampler + +// Backend stores and counts traces and signatures ingested by a sampler. 
+type Backend interface { + // Run runs the blocking execution of the backend main loop. + Run() + + // Stop stops the backend main loop. + Stop() + + // CountSample counts that 1 trace is going through the sampler. + CountSample() + + // CountSignature counts that 1 signature is going through the sampler. + CountSignature(signature Signature) + + // GetTotalScore returns the TPS (Traces Per Second) of all traces ingested. + GetTotalScore() float64 + + // GetSampledScore returns the TPS of all traces sampled. + GetSampledScore() float64 + + // GetUpperSampledScore is similar to GetSampledScore, but with the upper approximation. + GetUpperSampledScore() float64 + + // GetSignatureScore returns the TPS of traces ingested of a given signature. + GetSignatureScore(signature Signature) float64 + + // GetSignatureScores returns the TPS of traces ingested for all signatures. + GetSignatureScores() map[Signature]float64 + + // GetCardinality returns the number of different signatures seen. + GetCardinality() int64 +} diff --git a/pkg/trace/sampler/catalog.go b/pkg/trace/sampler/catalog.go new file mode 100644 index 0000000000000..c1213a23723ec --- /dev/null +++ b/pkg/trace/sampler/catalog.go @@ -0,0 +1,44 @@ +package sampler + +import "sync" + +const defaultServiceRateKey = "service:,env:" + +// serviceKeyCatalog reverse-maps service signatures to their generated hashes for +// easy look up. +type serviceKeyCatalog struct { + mu sync.Mutex + lookup map[ServiceSignature]Signature +} + +// newServiceLookup returns a new serviceKeyCatalog. +func newServiceLookup() *serviceKeyCatalog { + return &serviceKeyCatalog{ + lookup: make(map[ServiceSignature]Signature), + } +} + +func (cat *serviceKeyCatalog) register(svcSig ServiceSignature) Signature { + hash := svcSig.Hash() + cat.mu.Lock() + cat.lookup[svcSig] = hash + cat.mu.Unlock() + return hash +} + +// ratesByService returns a map of service signatures mapping to the rates identified using +// the signatures. 
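+// The zero-value ServiceSignature key (rendered as "service:,env:") carries the default
+// rate taken from totalScore, and signatures that no longer appear in rates are evicted
+// from the catalog. Illustrative shape of the result (hypothetical values):
+//
+//	map[ServiceSignature]float64{
+//		{"mcnulty", "test"}: 0.3,
+//		{}:                  0.2, // default rate, keyed by the empty signature
+//	}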
+func (cat serviceKeyCatalog) ratesByService(rates map[Signature]float64, totalScore float64) map[ServiceSignature]float64 { + rbs := make(map[ServiceSignature]float64, len(rates)+1) + defer cat.mu.Unlock() + cat.mu.Lock() + for key, sig := range cat.lookup { + if rate, ok := rates[sig]; ok { + rbs[key] = rate + } else { + delete(cat.lookup, key) + } + } + rbs[ServiceSignature{}] = totalScore + return rbs +} diff --git a/pkg/trace/sampler/catalog_test.go b/pkg/trace/sampler/catalog_test.go new file mode 100644 index 0000000000000..7604d85c7c526 --- /dev/null +++ b/pkg/trace/sampler/catalog_test.go @@ -0,0 +1,85 @@ +package sampler + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestServiceSignatureString(t *testing.T) { + assert := assert.New(t) + + assert.Equal(defaultServiceRateKey, ServiceSignature{}.String()) + assert.Equal("service:mcnulty,env:test", ServiceSignature{"mcnulty", "test"}.String()) +} + +func TestNewServiceLookup(t *testing.T) { + cat := newServiceLookup() + assert.NotNil(t, cat.lookup) +} + +func TestServiceKeyCatalogRegister(t *testing.T) { + assert := assert.New(t) + + cat := newServiceLookup() + s := getTestPriorityEngine() + + _, root1 := getTestTraceWithService(t, "service1", s) + sig1 := cat.register(ServiceSignature{root1.Service, defaultEnv}) + assert.Equal( + map[ServiceSignature]Signature{ + ServiceSignature{"service1", "none"}: sig1, + }, + cat.lookup, + ) + + _, root2 := getTestTraceWithService(t, "service2", s) + sig2 := cat.register(ServiceSignature{root2.Service, defaultEnv}) + assert.Equal( + map[ServiceSignature]Signature{ + ServiceSignature{"service1", "none"}: sig1, + ServiceSignature{"service2", "none"}: sig2, + }, + cat.lookup, + ) +} + +func TestServiceKeyCatalogRatesByService(t *testing.T) { + assert := assert.New(t) + + cat := newServiceLookup() + s := getTestPriorityEngine() + + _, root1 := getTestTraceWithService(t, "service1", s) + sig1 := cat.register(ServiceSignature{root1.Service, defaultEnv}) + _, root2 := getTestTraceWithService(t, "service2", s) + sig2 := cat.register(ServiceSignature{root2.Service, defaultEnv}) + + rates := map[Signature]float64{ + sig1: 0.3, + sig2: 0.7, + } + const totalRate = 0.2 + + rateByService := cat.ratesByService(rates, totalRate) + assert.Equal(map[ServiceSignature]float64{ + ServiceSignature{"service1", "none"}: 0.3, + ServiceSignature{"service2", "none"}: 0.7, + ServiceSignature{}: 0.2, + }, rateByService) + + delete(rates, sig1) + + rateByService = cat.ratesByService(rates, totalRate) + assert.Equal(map[ServiceSignature]float64{ + ServiceSignature{"service2", "none"}: 0.7, + ServiceSignature{}: 0.2, + }, rateByService) + + delete(rates, sig2) + + rateByService = cat.ratesByService(rates, totalRate) + assert.Equal(map[ServiceSignature]float64{ + ServiceSignature{}: 0.2, + }, rateByService) +} diff --git a/pkg/trace/sampler/coresampler.go b/pkg/trace/sampler/coresampler.go new file mode 100644 index 0000000000000..5597bd3378f57 --- /dev/null +++ b/pkg/trace/sampler/coresampler.go @@ -0,0 +1,163 @@ +package sampler + +import ( + "math" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" +) + +const ( + // Sampler parameters not (yet?) configurable + defaultDecayPeriod time.Duration = 5 * time.Second + // With this factor, any past trace counts for less than 50% after 6*decayPeriod and >1% after 39*decayPeriod + // We can keep it hardcoded, but having `decayPeriod` configurable should be enough? 
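+	// Worked check of the figures above, assuming a count is divided by the decay factor
+	// once per decayPeriod: 1.125^6 ~ 2.03, so after 6 periods an old count weighs
+	// ~ 1/2.03 ~ 49% (< 50%); 1.125^39 ~ 99, so after 39 periods it still weighs ~ 1%.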
+ defaultDecayFactor float64 = 1.125 // 9/8 + adjustPeriod time.Duration = 10 * time.Second + initialSignatureScoreOffset float64 = 1 + minSignatureScoreOffset float64 = 0.01 + defaultSignatureScoreSlope float64 = 3 +) + +// EngineType represents the type of a sampler engine. +type EngineType int + +const ( + // NormalScoreEngineType is the type of the ScoreEngine sampling non-error traces. + NormalScoreEngineType EngineType = iota + // ErrorsScoreEngineType is the type of the ScoreEngine sampling error traces. + ErrorsScoreEngineType + // PriorityEngineType is type of the priority sampler engine type. + PriorityEngineType +) + +// Engine is a common basic interface for sampler engines. +type Engine interface { + // Run the sampler. + Run() + // Stop the sampler. + Stop() + // Sample a trace. + Sample(trace pb.Trace, root *pb.Span, env string) (sampled bool, samplingRate float64) + // GetState returns information about the sampler. + GetState() interface{} + // GetType returns the type of the sampler. + GetType() EngineType +} + +// Sampler is the main component of the sampling logic +type Sampler struct { + // Storage of the state of the sampler + Backend Backend + + // Extra sampling rate to combine to the existing sampling + extraRate float64 + // Maximum limit to the total number of traces per second to sample + maxTPS float64 + + // Sample any signature with a score lower than scoreSamplingOffset + // It is basically the number of similar traces per second after which we start sampling + signatureScoreOffset *atomicFloat64 + // Logarithm slope for the scoring function + signatureScoreSlope *atomicFloat64 + // signatureScoreFactor = math.Pow(signatureScoreSlope, math.Log10(scoreSamplingOffset)) + signatureScoreFactor *atomicFloat64 + + exit chan struct{} +} + +// newSampler returns an initialized Sampler +func newSampler(extraRate float64, maxTPS float64) *Sampler { + s := &Sampler{ + Backend: NewMemoryBackend(defaultDecayPeriod, defaultDecayFactor), + extraRate: extraRate, + maxTPS: maxTPS, + signatureScoreOffset: newFloat64(0), + signatureScoreSlope: newFloat64(0), + signatureScoreFactor: newFloat64(0), + + exit: make(chan struct{}), + } + + s.SetSignatureCoefficients(initialSignatureScoreOffset, defaultSignatureScoreSlope) + + return s +} + +// SetSignatureCoefficients updates the internal scoring coefficients used by the signature scoring +func (s *Sampler) SetSignatureCoefficients(offset float64, slope float64) { + s.signatureScoreOffset.Store(offset) + s.signatureScoreSlope.Store(slope) + s.signatureScoreFactor.Store(math.Pow(slope, math.Log10(offset))) +} + +// UpdateExtraRate updates the extra sample rate +func (s *Sampler) UpdateExtraRate(extraRate float64) { + s.extraRate = extraRate +} + +// UpdateMaxTPS updates the max TPS limit +func (s *Sampler) UpdateMaxTPS(maxTPS float64) { + s.maxTPS = maxTPS +} + +// Run runs and block on the Sampler main loop +func (s *Sampler) Run() { + go func() { + defer watchdog.LogOnPanic() + s.Backend.Run() + }() + s.RunAdjustScoring() +} + +// Stop stops the main Run loop +func (s *Sampler) Stop() { + s.Backend.Stop() + close(s.exit) +} + +// RunAdjustScoring is the sampler feedback loop to adjust the scoring coefficients +func (s *Sampler) RunAdjustScoring() { + t := time.NewTicker(adjustPeriod) + defer t.Stop() + + for { + select { + case <-t.C: + s.AdjustScoring() + case <-s.exit: + return + } + } +} + +// GetSampleRate returns the sample rate to apply to a trace. 
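+// The result is the signature-based rate multiplied by the configured extraRate.
+// A caller sketch (illustrative only, not the exact call site):
+//
+//	rate := s.GetSampleRate(trace, root, signature)
+//	if !SampleByRate(root.TraceID, rate) {
+//		// drop the trace
+//	}
+//	AddGlobalRate(root, rate)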
+func (s *Sampler) GetSampleRate(trace pb.Trace, root *pb.Span, signature Signature) float64 { + rate := s.GetSignatureSampleRate(signature) * s.extraRate + + return rate +} + +// GetMaxTPSSampleRate returns an extra sample rate to apply if we are above maxTPS. +func (s *Sampler) GetMaxTPSSampleRate() float64 { + // When above maxTPS, apply an additional sample rate to statistically respect the limit + maxTPSrate := 1.0 + if s.maxTPS > 0 { + currentTPS := s.Backend.GetUpperSampledScore() + if currentTPS > s.maxTPS { + maxTPSrate = s.maxTPS / currentTPS + } + } + + return maxTPSrate +} + +// CombineRates merges two rates from Sampler1, Sampler2. Both samplers law are independant, +// and {sampled} = {sampled by Sampler1} or {sampled by Sampler2} +func CombineRates(rate1 float64, rate2 float64) float64 { + if rate1 >= 1 || rate2 >= 1 { + return 1 + } + return rate1 + rate2 - rate1*rate2 +} diff --git a/pkg/trace/sampler/coresampler_test.go b/pkg/trace/sampler/coresampler_test.go new file mode 100644 index 0000000000000..de745826e4b56 --- /dev/null +++ b/pkg/trace/sampler/coresampler_test.go @@ -0,0 +1,84 @@ +package sampler + +import ( + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" + "github.com/stretchr/testify/assert" +) + +func getTestSampler() *Sampler { + // Disable debug logs in these tests + log.UseLogger(log.Disabled) + + // No extra fixed sampling, no maximum TPS + extraRate := 1.0 + maxTPS := 0.0 + + return newSampler(extraRate, maxTPS) +} + +func TestSamplerAccessRace(t *testing.T) { + // regression test: even though the sampler is channel protected, it + // has getters accessing its fields. + s := newSampler(1, 2) + go func() { + for i := 0; i < 10000; i++ { + s.SetSignatureCoefficients(float64(i), float64(i)/2) + } + }() + for i := 0; i < 5000; i++ { + s.GetState() + s.GetAllCountScores() + } +} + +func TestSamplerLoop(t *testing.T) { + s := getTestSampler() + + exit := make(chan bool) + + go func() { + s.Run() + close(exit) + }() + + s.Stop() + + select { + case <-exit: + return + case <-time.After(time.Second * 1): + assert.Fail(t, "Sampler took more than 1 second to close") + } +} + +func TestCombineRates(t *testing.T) { + var combineRatesTests = []struct { + rate1, rate2 float64 + expected float64 + }{ + {0.1, 1.0, 1.0}, + {0.3, 0.2, 0.44}, + {0.0, 0.5, 0.5}, + } + for _, tt := range combineRatesTests { + assert.Equal(t, tt.expected, CombineRates(tt.rate1, tt.rate2)) + assert.Equal(t, tt.expected, CombineRates(tt.rate2, tt.rate1)) + } +} + +func TestAddSampleRate(t *testing.T) { + assert := assert.New(t) + tID := randomTraceID() + + root := &pb.Span{TraceID: tID, SpanID: 1, ParentID: 0, Start: 123, Duration: 100000, Service: "mcnulty", Type: "web"} + + AddGlobalRate(root, 0.4) + assert.Equal(0.4, root.Metrics["_sample_rate"], "sample rate should be 40%%") + + AddGlobalRate(root, 0.5) + assert.Equal(0.2, root.Metrics["_sample_rate"], "sample rate should be 20%% (50%% of 40%%)") +} diff --git a/pkg/trace/sampler/dynamic_config.go b/pkg/trace/sampler/dynamic_config.go new file mode 100644 index 0000000000000..ba5906a770696 --- /dev/null +++ b/pkg/trace/sampler/dynamic_config.go @@ -0,0 +1,70 @@ +package sampler + +import ( + "sync" +) + +// DynamicConfig contains configuration items which may change +// dynamically over time. +type DynamicConfig struct { + // RateByService contains the rate for each service/env tuple, + // used in priority sampling by client libs. 
+ RateByService RateByService +} + +// NewDynamicConfig creates a new dynamic config object which maps service signatures +// to their corresponding sampling rates. Each service will have a default assigned +// matching the service rate of the specified env. +func NewDynamicConfig(env string) *DynamicConfig { + return &DynamicConfig{RateByService: RateByService{defaultEnv: env}} +} + +// RateByService stores the sampling rate per service. It is thread-safe, so +// one can read/write on it concurrently, using getters and setters. +type RateByService struct { + defaultEnv string // env. to use for service defaults + + mu sync.RWMutex // guards rates + rates map[string]float64 +} + +// SetAll the sampling rate for all services. If a service/env is not +// in the map, then the entry is removed. +func (rbs *RateByService) SetAll(rates map[ServiceSignature]float64) { + rbs.mu.Lock() + defer rbs.mu.Unlock() + + if rbs.rates == nil { + rbs.rates = make(map[string]float64, len(rates)) + } + for k := range rbs.rates { + delete(rbs.rates, k) + } + for k, v := range rates { + if v < 0 { + v = 0 + } + if v > 1 { + v = 1 + } + rbs.rates[k.String()] = v + if k.Env == rbs.defaultEnv { + // if this is the default env, then this is also the + // service's default rate unbound to any env. + rbs.rates[ServiceSignature{Name: k.Name}.String()] = v + } + } +} + +// GetAll returns all sampling rates for all services. +func (rbs *RateByService) GetAll() map[string]float64 { + rbs.mu.RLock() + defer rbs.mu.RUnlock() + + ret := make(map[string]float64, len(rbs.rates)) + for k, v := range rbs.rates { + ret[k] = v + } + + return ret +} diff --git a/pkg/trace/sampler/dynamic_config_test.go b/pkg/trace/sampler/dynamic_config_test.go new file mode 100644 index 0000000000000..1a6df835d199a --- /dev/null +++ b/pkg/trace/sampler/dynamic_config_test.go @@ -0,0 +1,198 @@ +package sampler + +import ( + "strconv" + "sync" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestNewDynamicConfig(t *testing.T) { + assert := assert.New(t) + + dc := NewDynamicConfig("none") + assert.NotNil(dc) + + rates := map[ServiceSignature]float64{ + ServiceSignature{"myservice", "myenv"}: 0.5, + } + + // Not doing a complete test of the different components of dynamic config, + // but still assessing it can do the bare minimum once initialized. 
+ dc.RateByService.SetAll(rates) + rbs := dc.RateByService.GetAll() + assert.Equal(map[string]float64{"service:myservice,env:myenv": 0.5}, rbs) +} + +func TestRateByServiceGetSet(t *testing.T) { + var rbc RateByService + for i, tc := range []struct { + in map[ServiceSignature]float64 + out map[string]float64 + }{ + { + in: map[ServiceSignature]float64{ + ServiceSignature{}: 0.1, + }, + out: map[string]float64{ + "service:,env:": 0.1, + }, + }, + { + in: map[ServiceSignature]float64{ + ServiceSignature{}: 0.3, + ServiceSignature{"mcnulty", "dev"}: 0.2, + ServiceSignature{"postgres", "dev"}: 0.1, + }, + out: map[string]float64{ + "service:,env:": 0.3, + "service:mcnulty,env:dev": 0.2, + "service:postgres,env:dev": 0.1, + }, + }, + { + in: map[ServiceSignature]float64{ + ServiceSignature{}: 1, + }, + out: map[string]float64{ + "service:,env:": 1, + }, + }, + { + out: map[string]float64{}, + }, + { + in: map[ServiceSignature]float64{ + ServiceSignature{}: 0.2, + }, + out: map[string]float64{ + "service:,env:": 0.2, + }, + }, + } { + rbc.SetAll(tc.in) + assert.Equal(t, tc.out, rbc.GetAll(), strconv.Itoa(i)) + } +} + +func TestRateByServiceLimits(t *testing.T) { + assert := assert.New(t) + + var rbc RateByService + rbc.SetAll(map[ServiceSignature]float64{ + ServiceSignature{"high", ""}: 2, + ServiceSignature{"low", ""}: -1, + }) + assert.Equal(map[string]float64{"service:high,env:": 1, "service:low,env:": 0}, rbc.GetAll()) +} + +func TestRateByServiceDefaults(t *testing.T) { + rbc := RateByService{defaultEnv: "test"} + rbc.SetAll(map[ServiceSignature]float64{ + ServiceSignature{"one", "prod"}: 0.5, + ServiceSignature{"two", "test"}: 0.4, + }) + assert.Equal(t, map[string]float64{ + "service:one,env:prod": 0.5, + "service:two,env:test": 0.4, + "service:two,env:": 0.4, + }, rbc.GetAll()) +} + +func TestRateByServiceConcurrency(t *testing.T) { + assert := assert.New(t) + + var rbc RateByService + + const n = 1000 + var wg sync.WaitGroup + wg.Add(2) + + rbc.SetAll(map[ServiceSignature]float64{ServiceSignature{"mcnulty", "test"}: 1}) + go func() { + for i := 0; i < n; i++ { + rate := float64(i) / float64(n) + rbc.SetAll(map[ServiceSignature]float64{ServiceSignature{"mcnulty", "test"}: rate}) + } + wg.Done() + }() + go func() { + for i := 0; i < n; i++ { + rates := rbc.GetAll() + _, ok := rates["service:mcnulty,env:test"] + assert.True(ok, "key should be here") + } + wg.Done() + }() +} + +func benchRBSGetAll(sigs map[ServiceSignature]float64) func(*testing.B) { + return func(b *testing.B) { + rbs := &RateByService{defaultEnv: "test"} + rbs.SetAll(sigs) + + b.ResetTimer() + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + rbs.GetAll() + } + } +} + +func benchRBSSetAll(sigs map[ServiceSignature]float64) func(*testing.B) { + return func(b *testing.B) { + rbs := &RateByService{defaultEnv: "test"} + + b.ResetTimer() + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + rbs.SetAll(sigs) + } + } +} + +func BenchmarkRateByService(b *testing.B) { + sigs := map[ServiceSignature]float64{ + ServiceSignature{}: 0.2, + ServiceSignature{"two", "test"}: 0.4, + ServiceSignature{"three", "test"}: 0.33, + ServiceSignature{"one", "prod"}: 0.12, + ServiceSignature{"five", "test"}: 0.8, + ServiceSignature{"six", "staging"}: 0.9, + } + + b.Run("GetAll", func(b *testing.B) { + for i := 1; i <= len(sigs); i++ { + // take first i elements + testSigs := make(map[ServiceSignature]float64, i) + var j int + for k, v := range sigs { + j++ + testSigs[k] = v + if j == i { + break + } + } + b.Run(strconv.Itoa(i), 
benchRBSGetAll(testSigs)) + } + }) + + b.Run("SetAll", func(b *testing.B) { + for i := 1; i <= len(sigs); i++ { + // take first i elements + testSigs := make(map[ServiceSignature]float64, i) + var j int + for k, v := range sigs { + j++ + testSigs[k] = v + if j == i { + break + } + } + b.Run(strconv.Itoa(i), benchRBSSetAll(testSigs)) + } + }) +} diff --git a/pkg/trace/sampler/float64.go b/pkg/trace/sampler/float64.go new file mode 100644 index 0000000000000..9a4903185e69f --- /dev/null +++ b/pkg/trace/sampler/float64.go @@ -0,0 +1,49 @@ +package sampler + +import ( + "math" + "sync/atomic" +) + +// The atomic float64 is copied from: https://github.com/uber-go/atomic/blob/master/atomic.go#L267 + +// atomicFloat64 is an atomic wrapper around float64. +type atomicFloat64 struct { + v uint64 +} + +// newFloat64 creates a atomicFloat64. +func newFloat64(f float64) *atomicFloat64 { + return &atomicFloat64{math.Float64bits(f)} +} + +// Load atomically loads the wrapped value. +func (f *atomicFloat64) Load() float64 { + return math.Float64frombits(atomic.LoadUint64(&f.v)) +} + +// Store atomically stores the passed value. +func (f *atomicFloat64) Store(s float64) { + atomic.StoreUint64(&f.v, math.Float64bits(s)) +} + +// Add atomically adds to the wrapped float64 and returns the new value. +func (f *atomicFloat64) Add(s float64) float64 { + for { + old := f.Load() + new := old + s + if f.CAS(old, new) { + return new + } + } +} + +// Sub atomically subtracts from the wrapped float64 and returns the new value. +func (f *atomicFloat64) Sub(s float64) float64 { + return f.Add(-s) +} + +// CAS is an atomic compare-and-swap. +func (f *atomicFloat64) CAS(old, new float64) bool { + return atomic.CompareAndSwapUint64(&f.v, math.Float64bits(old), math.Float64bits(new)) +} diff --git a/pkg/trace/sampler/memory_backend.go b/pkg/trace/sampler/memory_backend.go new file mode 100644 index 0000000000000..2bdf75da5beb7 --- /dev/null +++ b/pkg/trace/sampler/memory_backend.go @@ -0,0 +1,163 @@ +package sampler + +import ( + "sync" + "time" +) + +// MemoryBackend storing any state required to run the sampling algorithms. +// +// Current implementation is only based on counters with polynomial decay. +// Its bias with steady counts is 1 * decayFactor. +// The stored scores represent approximation of the real count values (with a countScaleFactor factor). +type MemoryBackend struct { + // scores maps signatures to scores. + scores map[Signature]float64 + + // totalScore holds the score sum of all traces (equals the sum of all signature scores). + totalScore float64 + + // sampledScore is the score of all sampled traces. + sampledScore float64 + + // mu is a lock protecting all the scores. + mu sync.RWMutex + + // decayPeriod is the time period between each score decay. + // A lower value is more reactive, but forgets quicker. + decayPeriod time.Duration + + // decayFactor is how much we reduce/divide the score at every decay run. + // A lower value is more reactive, but forgets quicker. + decayFactor float64 + + // countScaleFactor is the factor to apply to move from the score + // to the representing number of traces per second. + // By definition of the decay formula is: + // countScaleFactor = (decayFactor / (decayFactor - 1)) * decayPeriod + // It also represents by how much a spike is smoothed: if we instantly + // receive N times the same signature, its immediate count will be + // increased by N / countScaleFactor. + countScaleFactor float64 + + // exit is the channel to close to stop the run loop. 
+ exit chan struct{} +} + +// NewMemoryBackend returns an initialized Backend. +func NewMemoryBackend(decayPeriod time.Duration, decayFactor float64) *MemoryBackend { + return &MemoryBackend{ + scores: make(map[Signature]float64), + decayPeriod: decayPeriod, + decayFactor: decayFactor, + countScaleFactor: (decayFactor / (decayFactor - 1)) * decayPeriod.Seconds(), + exit: make(chan struct{}), + } +} + +// Run runs and block on the Sampler main loop. +func (b *MemoryBackend) Run() { + t := time.NewTicker(b.decayPeriod) + defer t.Stop() + + for { + select { + case <-t.C: + b.decayScore() + case <-b.exit: + return + } + } +} + +// Stop stops the main Run loop. +func (b *MemoryBackend) Stop() { + close(b.exit) +} + +// CountSignature counts an incoming signature. +func (b *MemoryBackend) CountSignature(signature Signature) { + b.mu.Lock() + b.scores[signature]++ + b.totalScore++ + b.mu.Unlock() +} + +// CountSample counts a trace sampled by the sampler. +func (b *MemoryBackend) CountSample() { + b.mu.Lock() + b.sampledScore++ + b.mu.Unlock() +} + +// GetSignatureScore returns the score of a signature. +// It is normalized to represent a number of signatures per second. +func (b *MemoryBackend) GetSignatureScore(signature Signature) float64 { + b.mu.RLock() + score := b.scores[signature] / b.countScaleFactor + b.mu.RUnlock() + + return score +} + +// GetSignatureScores returns the scores for all signatures. +// It is normalized to represent a number of signatures per second. +func (b *MemoryBackend) GetSignatureScores() map[Signature]float64 { + b.mu.RLock() + scores := make(map[Signature]float64, len(b.scores)) + for signature, score := range b.scores { + scores[signature] = score / b.countScaleFactor + } + b.mu.RUnlock() + + return scores +} + +// GetSampledScore returns the global score of all sampled traces. +func (b *MemoryBackend) GetSampledScore() float64 { + b.mu.RLock() + score := b.sampledScore / b.countScaleFactor + b.mu.RUnlock() + + return score +} + +// GetTotalScore returns the global score of all sampled traces. +func (b *MemoryBackend) GetTotalScore() float64 { + b.mu.RLock() + score := b.totalScore / b.countScaleFactor + b.mu.RUnlock() + + return score +} + +// GetUpperSampledScore returns a certain upper bound of the global count of all sampled traces. +func (b *MemoryBackend) GetUpperSampledScore() float64 { + // Overestimate the real score with the high limit of the backend bias. + return b.GetSampledScore() * b.decayFactor +} + +// GetCardinality returns the number of different signatures seen recently. +func (b *MemoryBackend) GetCardinality() int64 { + b.mu.RLock() + cardinality := int64(len(b.scores)) + b.mu.RUnlock() + + return cardinality +} + +// decayScore applies the decay to the rolling counters. +func (b *MemoryBackend) decayScore() { + b.mu.Lock() + for sig := range b.scores { + if b.scores[sig] > b.decayFactor*minSignatureScoreOffset { + b.scores[sig] /= b.decayFactor + } else { + // When the score is too small, we can optimize by simply dropping the entry. 
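+			// ("too small" means <= decayFactor*minSignatureScoreOffset, i.e. ~0.011
+			// with the defaults, so signatures that stopped arriving are eventually freed)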
+ delete(b.scores, sig) + } + } + b.totalScore /= b.decayFactor + b.sampledScore /= b.decayFactor + b.mu.Unlock() +} diff --git a/pkg/trace/sampler/memory_backend_test.go b/pkg/trace/sampler/memory_backend_test.go new file mode 100644 index 0000000000000..44547e93559e6 --- /dev/null +++ b/pkg/trace/sampler/memory_backend_test.go @@ -0,0 +1,92 @@ +package sampler + +import ( + "math/rand" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func getTestBackend() *MemoryBackend { + decayPeriod := 5 * time.Second + + return NewMemoryBackend(decayPeriod, defaultDecayFactor) +} + +func randomSignature() Signature { + return Signature(rand.Int63()) +} + +func TestBasicNewBackend(t *testing.T) { + assert := assert.New(t) + + backend := getTestBackend() + + sign := randomSignature() + backend.CountSignature(sign) + + assert.True(backend.GetSignatureScore(sign) > 0.0) + assert.Equal(0.0, backend.GetSignatureScore(randomSignature())) +} + +func TestCountScoreConvergence(t *testing.T) { + // With a constant number of tracesPerPeriod, the backend score should converge to tracesPerPeriod + // Test the convergence of both signature and total sampled counters + backend := getTestBackend() + + sign := randomSignature() + + periods := 50 + tracesPerPeriod := 1000 + period := backend.decayPeriod + + for period := 0; period < periods; period++ { + backend.decayScore() + for i := 0; i < tracesPerPeriod; i++ { + backend.CountSignature(sign) + backend.CountSample() + } + } + + assert.InEpsilon(t, backend.GetSignatureScore(sign), float64(tracesPerPeriod)/period.Seconds(), 0.01) + assert.InEpsilon(t, backend.GetSampledScore(), float64(tracesPerPeriod)/period.Seconds(), 0.01) +} + +func TestCountScoreOblivion(t *testing.T) { + // After some time, past traces shouldn't impact the score + assert := assert.New(t) + backend := getTestBackend() + + sign := randomSignature() + + // Number of tracesPerPeriod in the initial phase + tracesPerPeriod := 1000 + ticks := 50 + + for period := 0; period < ticks; period++ { + backend.decayScore() + for i := 0; i < tracesPerPeriod; i++ { + backend.CountSignature(sign) + } + } + + // Second phase: we stop receiving this signature + + // How long to wait until score is >50% the initial score (TODO: make it function of the config) + halfLifePeriods := 6 + // How long to wait until score is >1% the initial score + oblivionPeriods := 40 + + for period := 0; period < halfLifePeriods; period++ { + backend.decayScore() + } + + assert.True(backend.GetSignatureScore(sign) < 0.5*float64(tracesPerPeriod)) + + for period := 0; period < oblivionPeriods-halfLifePeriods; period++ { + backend.decayScore() + } + + assert.True(backend.GetSignatureScore(sign) < 0.01*float64(tracesPerPeriod)) +} diff --git a/pkg/trace/sampler/presampler.go b/pkg/trace/sampler/presampler.go new file mode 100644 index 0000000000000..b729f99ec2e92 --- /dev/null +++ b/pkg/trace/sampler/presampler.go @@ -0,0 +1,241 @@ +package sampler + +// [TODO:christian] publish all through expvar, but wait until the PR +// with cpu watchdog is merged as there are probably going to be git conflicts... + +import ( + "fmt" + "net/http" + "strconv" + "sync" + "time" + + log "github.com/cihub/seelog" +) + +const ( + // TraceCountHeader is the header client implementation should fill + // with the number of traces contained in the payload. + TraceCountHeader = "X-Datadog-Trace-Count" +) + +// PreSamplerStats contains pre-sampler data. The public content +// might be interesting for statistics, logging. 
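+// The Recent* counters are decayed rolling counts, not absolute totals, and
+// RealRate() is derived from them as 1 - RecentTracesDropped/RecentTracesSeen
+// (falling back to the target Rate while nothing has been seen yet).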
+type PreSamplerStats struct { + // Rate is the target pre-sampling rate. + Rate float64 + // Error is the last error got when trying to calc the pre-sampling rate. + // Stored as a string as this is easier to marshal & publish in JSON. + Error string + // RecentPayloadsSeen is the number of payloads that passed by. + RecentPayloadsSeen float64 + // RecentTracesSeen is the number of traces that passed by. + RecentTracesSeen float64 + // RecentTracesDropped is the number of traces that were dropped. + RecentTracesDropped float64 +} + +// PreSampler tries to tell wether we should keep a payload, even +// before fully processing it. Its only clues are the unparsed payload +// and the HTTP headers. It should remain very light and fast. +type PreSampler struct { + stats PreSamplerStats + decayPeriod time.Duration + decayFactor float64 + mu sync.RWMutex // needed since many requests can run in parallel + exit chan struct{} +} + +// NewPreSampler returns an initialized presampler +func NewPreSampler() *PreSampler { + decayFactor := 9.0 / 8.0 + return &PreSampler{ + stats: PreSamplerStats{ + Rate: 1, + }, + decayPeriod: defaultDecayPeriod, + decayFactor: decayFactor, + exit: make(chan struct{}), + } +} + +// Run runs and block on the Sampler main loop +func (ps *PreSampler) Run() { + t := time.NewTicker(ps.decayPeriod) + defer t.Stop() + + for { + select { + case <-t.C: + ps.decayScore() + case <-ps.exit: + return + } + } +} + +// Stop stops the main Run loop +func (ps *PreSampler) Stop() { + close(ps.exit) +} + +// SetRate set the pre-sample rate, thread-safe. +func (ps *PreSampler) SetRate(rate float64) { + ps.mu.Lock() + ps.stats.Rate = rate + ps.mu.Unlock() +} + +// Rate returns the current target pre-sample rate, thread-safe. +// The target pre-sample rate is the value set with SetRate, ideally this +// is the sample rate, but depending on what is received, the real rate +// might defer. +func (ps *PreSampler) Rate() float64 { + ps.mu.RLock() + rate := ps.stats.Rate + ps.mu.RUnlock() + return rate +} + +// SetError set the pre-sample error, thread-safe. +func (ps *PreSampler) SetError(err error) { + ps.mu.Lock() + if err != nil { + ps.stats.Error = err.Error() + } else { + ps.stats.Error = "" + } + ps.mu.Unlock() +} + +// RealRate returns the current real pre-sample rate, thread-safe. +// This is the value obtained by counting what was kept and dropped. +func (ps *PreSampler) RealRate() float64 { + ps.mu.RLock() + rate := ps.stats.RealRate() + ps.mu.RUnlock() + return rate +} + +// RealRate calcuates the current real pre-sample rate from +// the stats data. If no data is available, returns the target rate. +func (stats *PreSamplerStats) RealRate() float64 { + if stats.RecentTracesSeen <= 0 { // careful with div by 0 + return stats.Rate + } + return 1 - (stats.RecentTracesDropped / stats.RecentTracesSeen) +} + +// Stats returns a copy of the currrent pre-sampler stats. +func (ps *PreSampler) Stats() *PreSamplerStats { + ps.mu.RLock() + stats := ps.stats + ps.mu.RUnlock() + return &stats +} + +func (ps *PreSampler) sampleWithCount(traceCount int64) bool { + if traceCount <= 0 { + return true // no sensible value in traceCount, disable pre-sampling + } + + keep := true + + ps.mu.Lock() + + if ps.stats.RealRate() > ps.stats.Rate { + // Too many things processed, drop the current payload. + keep = false + ps.stats.RecentTracesDropped += float64(traceCount) + } + + // This should be done *after* testing RealRate() against Rate, + // else we could end up systematically dropping the first payload. 
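+	// For example, with Rate = 0.2 and empty stats, RealRate() == 0.2, which is not
+	// above Rate, so the first payload is kept; had the counters below been bumped
+	// first, RealRate() would already read 1 and that payload would be dropped.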
+ ps.stats.RecentPayloadsSeen++ + ps.stats.RecentTracesSeen += float64(traceCount) + + ps.mu.Unlock() + + if !keep { + log.Debugf("pre-sampling at rate %f dropped payload with %d traces", ps.Rate(), traceCount) + } + + return keep +} + +// Sample tells wether a given request should be kept (true means: "yes, keep it"). +// Calling this alters the statistics, it affects the result of RealRate() so +// only call it once per payload. +func (ps *PreSampler) Sample(req *http.Request) bool { + traceCount := int64(0) + if traceCountStr := req.Header.Get(TraceCountHeader); traceCountStr != "" { + var err error + traceCount, err = strconv.ParseInt(traceCountStr, 10, 64) + if err != nil { + log.Errorf("unable to parse HTTP header %s: %s", TraceCountHeader, traceCountStr) + } + } + + return ps.sampleWithCount(traceCount) +} + +// decayScore applies the decay to the rolling counters +func (ps *PreSampler) decayScore() { + ps.mu.Lock() + + ps.stats.RecentPayloadsSeen /= ps.decayFactor + ps.stats.RecentTracesSeen /= ps.decayFactor + ps.stats.RecentTracesDropped /= ps.decayFactor + + ps.mu.Unlock() +} + +// CalcPreSampleRate gives the new sample rate to apply for a given max user CPU average. +// It takes the current sample rate and user CPU average as those parameters both +// have an influence on the result. +func CalcPreSampleRate(maxUserAvg, currentUserAvg, currentRate float64) (float64, error) { + const ( + // deltaMin is a threshold that must be passed before changing the + // pre-sampling rate. If set to 0.1, for example, the new rate must be + // below 90% or above 110% of the previous value, before we actually + // adjust the sampling rate. This is to avoid over-adapting and jittering. + deltaMin = float64(0.15) // +/- 15% change + // rateMin is an absolute minimum rate, never sample more than this, it is + // inefficient, the cost handling the payloads without even reading them + // is too high anyway. + rateMin = float64(0.05) // 5% hard-limit + ) + + if maxUserAvg <= 0 || currentUserAvg < 0 || currentRate < 0 || currentRate > 1 { + return 1, fmt.Errorf("inconsistent pre-sampling input maxUserAvg=%f currentUserAvg=%f currentRate=%f", + maxUserAvg, currentUserAvg, currentRate) + } + if currentUserAvg == 0 || currentRate == 0 { + return 1, nil // not initialized yet, beside, need to return now else divide by zero error + } + + newRate := float64(1) + slope := currentUserAvg / currentRate + newRate = maxUserAvg / slope + if newRate >= 1 { + return 1, nil // no need to pre-sample anything + } + + delta := (newRate - currentRate) / currentRate + if delta > -deltaMin && delta < deltaMin { + // no need to change, this is close enough to what we want (avoid jittering) + return currentRate, nil + } + + // Taking the average of both values, it is going to converge in the long run, + // but no need to hurry, wait for next iteration. + newRate = (newRate + currentRate) / 2 + + if newRate < rateMin { + // Here, we would need a too-aggressive sampling rate to cope with + // our objective, and pre-sampling is not the right tool any more. 
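+		// E.g. maxUserAvg=0.1, currentUserAvg=50000, currentRate=0.05 gives
+		// newRate ~ 0.025 after averaging, so we clamp to rateMin (0.05) and report it.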
+ return rateMin, fmt.Errorf("raising pre-sampling rate from %0.1f %% to %0.1f %%", newRate*100, rateMin*100) + } + + return newRate, nil +} diff --git a/pkg/trace/sampler/presampler_test.go b/pkg/trace/sampler/presampler_test.go new file mode 100644 index 0000000000000..207bda4d90990 --- /dev/null +++ b/pkg/trace/sampler/presampler_test.go @@ -0,0 +1,171 @@ +package sampler + +import ( + "fmt" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestCalcPreSampleRate(t *testing.T) { + assert := assert.New(t) + + // [0] -> maxUserAvg: the value in the conf file + // [1] -> currentUserAvg: the value reported by the CPU watchdog + // [2] -> currentRate: the current (pre)sampling rate + expected := map[struct { + maxUserAvg float64 // the value in the conf file + currentUserAvg float64 // the value reported by the CPU watchdog + currentRate float64 // the current (pre)sampling rate + }]struct { + r float64 + err error + }{ + // Various cases showing general usage + {maxUserAvg: 0.1, currentUserAvg: 0.1, currentRate: 1}: {r: 1, err: nil}, + {maxUserAvg: 0.2, currentUserAvg: 0.1, currentRate: 1}: {r: 1, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.15, currentRate: 1}: {r: 0.8333333333333334, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.2, currentRate: 1}: {r: 0.75, err: nil}, + {maxUserAvg: 0.2, currentUserAvg: 1, currentRate: 1}: {r: 0.6, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.11, currentRate: 1}: {r: 1, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.09, currentRate: 1}: {r: 1, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.05, currentRate: 1}: {r: 1, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.11, currentRate: 0.5}: {r: 0.5, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.5, currentRate: 0.5}: {r: 0.3, err: nil}, + {maxUserAvg: 0.15, currentUserAvg: 0.05, currentRate: 0.5}: {r: 1, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 0.05, currentRate: 0.1}: {r: 0.15000000000000002, err: nil}, + {maxUserAvg: 0.04, currentUserAvg: 0.05, currentRate: 1}: {r: 0.8999999999999999, err: nil}, + {maxUserAvg: 0.025, currentUserAvg: 0.05, currentRate: 1}: {r: 0.75, err: nil}, + {maxUserAvg: 0.01, currentUserAvg: 0.05, currentRate: 0.1}: {r: 0.060000000000000005, err: nil}, + + // Check it's back to 1 even if current sampling rate is close to 1 + {maxUserAvg: 0.01, currentUserAvg: 0.005, currentRate: 0.99}: {r: 1, err: nil}, + + // Anti-jittering thing (not doing anything if target is too close to current) + {maxUserAvg: 5, currentUserAvg: 3, currentRate: 0.5}: {r: 0.6666666666666667, err: nil}, + {maxUserAvg: 5, currentUserAvg: 4, currentRate: 0.5}: {r: 0.5625, err: nil}, + {maxUserAvg: 5, currentUserAvg: 4.5, currentRate: 0.5}: {r: 0.5, err: nil}, + {maxUserAvg: 5, currentUserAvg: 4.9, currentRate: 0.5}: {r: 0.5, err: nil}, + {maxUserAvg: 5, currentUserAvg: 5, currentRate: 0.5}: {r: 0.5, err: nil}, + {maxUserAvg: 5, currentUserAvg: 5.1, currentRate: 0.5}: {r: 0.5, err: nil}, + {maxUserAvg: 5, currentUserAvg: 5.5, currentRate: 0.5}: {r: 0.5, err: nil}, + {maxUserAvg: 5, currentUserAvg: 6, currentRate: 0.5}: {r: 0.45833333333333337, err: nil}, + {maxUserAvg: 5, currentUserAvg: 7, currentRate: 0.5}: {r: 0.4285714285714286, err: nil}, + + // What happens when sampling at very high rate, and how do we converge on this + {maxUserAvg: 0.1, currentUserAvg: 1000000, currentRate: 1}: {r: 0.50000005, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 500000, currentRate: 0.50000005}: {r: 0.25000007500000504, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 250000, 
currentRate: 0.25000007500000504}: {r: 0.1250000875000175, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 125000, currentRate: 0.1250000875000175}: {r: 0.06250009375004376, err: nil}, + {maxUserAvg: 0.1, currentUserAvg: 65000, currentRate: 0.06250009375004376}: {r: 0.05, err: fmt.Errorf("raising pre-sampling rate from 3.1 %% to 5.0 %%")}, + {maxUserAvg: 0.1, currentUserAvg: 50000, currentRate: 0.05}: {r: 0.05, err: fmt.Errorf("raising pre-sampling rate from 2.5 %% to 5.0 %%")}, + + // not initialized yet, this is what happens at startup (no error, just default to 1) + {maxUserAvg: 0.1, currentUserAvg: 0, currentRate: 0}: {r: 1, err: nil}, + + // invalid input, those should really *NEVER* happen, test is just defensive + {maxUserAvg: 0, currentUserAvg: 0.1, currentRate: 0.1}: {r: 1, err: fmt.Errorf("inconsistent pre-sampling input maxUserAvg=0.000000 currentUserAvg=0.100000 currentRate=0.100000")}, + {maxUserAvg: 0.1, currentUserAvg: -0.02, currentRate: 0.1}: {r: 1, err: fmt.Errorf("inconsistent pre-sampling input maxUserAvg=0.100000 currentUserAvg=-0.020000 currentRate=0.100000")}, + {maxUserAvg: 0.1, currentUserAvg: 0.05, currentRate: -0.2}: {r: 1, err: fmt.Errorf("inconsistent pre-sampling input maxUserAvg=0.100000 currentUserAvg=0.050000 currentRate=-0.200000")}, + } + + for k, v := range expected { + r, err := CalcPreSampleRate(k.maxUserAvg, k.currentUserAvg, k.currentRate) + assert.Equal(v.r, r, "bad pre sample rate for maxUserAvg=%f currentUserAvg=%f, currentRate=%f, got %v, expected %v", k.maxUserAvg, k.currentUserAvg, k.currentRate, r, v.r) + if v.err == nil { + assert.Nil(err, "there should be no error for maxUserAvg=%f currentUserAvg=%f, currentRate=%f, got %v", k.maxUserAvg, k.currentUserAvg, k.currentRate, err) + } else { + assert.Equal(v.err, err, "unexpected error for maxUserAvg=%f currentUserAvg=%f, currentRate=%f, got %v, expected %v", k.maxUserAvg, k.currentUserAvg, k.currentRate, err, v.err) + } + } +} + +func TestPreSamplerRace(t *testing.T) { + var wg sync.WaitGroup + + const N = 1000 + ps := NewPreSampler() + wg.Add(5) + + go func() { + for i := 0; i < N; i++ { + ps.SetRate(0.5) + time.Sleep(time.Microsecond) + } + wg.Done() + }() + go func() { + for i := 0; i < N; i++ { + _ = ps.Rate() + time.Sleep(time.Microsecond) + } + wg.Done() + }() + go func() { + for i := 0; i < N; i++ { + _ = ps.RealRate() + time.Sleep(time.Microsecond) + } + wg.Done() + }() + go func() { + for i := 0; i < N; i++ { + _ = ps.sampleWithCount(42) + time.Sleep(time.Microsecond) + } + wg.Done() + }() + go func() { + for i := 0; i < N; i++ { + ps.decayScore() + time.Sleep(time.Microsecond) + } + wg.Done() + }() + wg.Wait() +} + +func TestPreSamplerSampleWithCount(t *testing.T) { + assert := assert.New(t) + + ps := NewPreSampler() + ps.SetRate(0.2) + assert.Equal(0.2, ps.RealRate(), "by default, RealRate returns wished rate") + assert.True(ps.sampleWithCount(100), "always accept first payload") + ps.decayScore() + assert.False(ps.sampleWithCount(10), "refuse as this accepting this would make 100%") + ps.decayScore() + assert.Equal(0.898876404494382, ps.RealRate()) + assert.False(ps.sampleWithCount(290), "still refuse") + ps.decayScore() + assert.False(ps.sampleWithCount(99), "just below the limit") + ps.decayScore() + assert.True(ps.sampleWithCount(1), "should there be no decay, this one would be dropped, but with decay, the rate decreased as the recently dropped gain importance over the old initially accepted") + ps.decayScore() + assert.Equal(0.16365162139216005, ps.RealRate(), "well below 20%, 
again, decay speaks") + assert.True(ps.sampleWithCount(1000000), "accepting payload with many traces") + ps.decayScore() + assert.Equal(0.9997119577953764, ps.RealRate(), "real rate is almost 1, as we accepted a hudge payload") + assert.False(ps.sampleWithCount(100000), "rejecting, real rate is too high now") + ps.decayScore() + assert.Equal(0.8986487877795845, ps.RealRate(), "real rate should be now around 90%") + assert.Equal(PreSamplerStats{ + Rate: 0.2, + RecentPayloadsSeen: 4.492300911839488, // seen more than this... but decay in action + RecentTracesSeen: 879284.5615616576, + RecentTracesDropped: 89116.55620097058, + }, ps.stats) +} + +func TestPreSamplerError(t *testing.T) { + assert := assert.New(t) + + ps := NewPreSampler() + assert.Equal("", ps.stats.Error, "fresh pre-sampler should have no error") + ps.SetError(fmt.Errorf("bad news")) + assert.Equal("bad news", ps.stats.Error, `error should be "bad news"`) + ps.SetError(nil) + assert.Equal("", ps.stats.Error, "after reset, error should be empty") +} diff --git a/pkg/trace/sampler/prioritysampler.go b/pkg/trace/sampler/prioritysampler.go new file mode 100644 index 0000000000000..b1940a166d427 --- /dev/null +++ b/pkg/trace/sampler/prioritysampler.go @@ -0,0 +1,146 @@ +// Package sampler contains all the logic of the agent-side trace sampling +// +// Currently implementation is based on the scoring of the "signature" of each trace +// Based on the score, we get a sample rate to apply to the given trace +// +// Current score implementation is super-simple, it is a counter with polynomial decay per signature. +// We increment it for each incoming trace then we periodically divide the score by two every X seconds. +// Right after the division, the score is an approximation of the number of received signatures over X seconds. +// It is different from the scoring in the Agent. +// +// Since the sampling can happen at different levels (client, agent, server) or depending on different rules, +// we have to track the sample rate applied at previous steps. This way, sampling twice at 50% can result in an +// effective 25% sampling. The rate is stored as a metric in the trace root. +package sampler + +import ( + "sync" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +const ( + // SamplingPriorityRateKey is the metrics key holding the sampling rate at which this trace + // was sampled. + SamplingPriorityRateKey = "_sampling_priority_rate_v1" + syncPeriod = 3 * time.Second +) + +// PriorityEngine is the main component of the sampling logic +type PriorityEngine struct { + // Sampler is the underlying sampler used by this engine, sharing logic among various engines. 
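+	// Its backend counts per-signature throughput; the derived per-service rates
+	// are pushed to rateByService every syncPeriod (see Run) so that client
+	// libraries can pick them up for priority sampling.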
+ Sampler *Sampler + + rateByService *RateByService + catalog *serviceKeyCatalog + exit chan struct{} +} + +// NewPriorityEngine returns an initialized Sampler +func NewPriorityEngine(extraRate float64, maxTPS float64, rateByService *RateByService) *PriorityEngine { + s := &PriorityEngine{ + Sampler: newSampler(extraRate, maxTPS), + rateByService: rateByService, + catalog: newServiceLookup(), + exit: make(chan struct{}), + } + + return s +} + +// Run runs and block on the Sampler main loop +func (s *PriorityEngine) Run() { + var wg sync.WaitGroup + wg.Add(2) + + go func() { + s.Sampler.Run() + wg.Done() + }() + + go func() { + t := time.NewTicker(syncPeriod) + defer t.Stop() + + for { + select { + case <-t.C: + s.rateByService.SetAll(s.ratesByService()) + case <-s.exit: + wg.Done() + return + } + } + }() + + wg.Wait() +} + +// Stop stops the main Run loop +func (s *PriorityEngine) Stop() { + s.Sampler.Stop() + close(s.exit) +} + +// Sample counts an incoming trace and returns the trace sampling decision and the applied sampling rate +func (s *PriorityEngine) Sample(trace pb.Trace, root *pb.Span, env string) (sampled bool, rate float64) { + // Extra safety, just in case one trace is empty + if len(trace) == 0 { + return false, 0 + } + + samplingPriority, _ := GetSamplingPriority(root) + + // Regardless of rates, sampling here is based on the metadata set + // by the client library. Which, is turn, is based on agent hints, + // but the rule of thumb is: respect client choice. + sampled = samplingPriority > 0 + + // Short-circuit and return without counting the trace in the sampling rate logic + // if its value has not been set automaticallt by the client lib. + // The feedback loop should be scoped to the values it can act upon. + if samplingPriority < 0 { + return sampled, 0 + } + if samplingPriority > 1 { + return sampled, 1 + } + + signature := s.catalog.register(ServiceSignature{root.Service, env}) + + // Update sampler state by counting this trace + s.Sampler.Backend.CountSignature(signature) + + // fetching applied sample rate + var ok bool + rate, ok = root.Metrics[SamplingPriorityRateKey] + if !ok { + rate = s.Sampler.GetSignatureSampleRate(signature) + root.Metrics[SamplingPriorityRateKey] = rate + } + + if sampled { + // Count the trace to allow us to check for the maxTPS limit. + // It has to happen before the maxTPS sampling. + s.Sampler.Backend.CountSample() + } + return sampled, rate +} + +// GetState collects and return internal statistics and coefficients for indication purposes +// It returns an interface{}, as other samplers might return other informations. +func (s *PriorityEngine) GetState() interface{} { + return s.Sampler.GetState() +} + +// ratesByService returns all rates by service, this information is useful for +// agents to pick the right service rate. 
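+// It merges the per-signature rates from the underlying Sampler with the default
+// rate, keyed by the empty ServiceSignature, via the service catalog.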
+func (s *PriorityEngine) ratesByService() map[ServiceSignature]float64 { + return s.catalog.ratesByService(s.Sampler.GetAllSignatureSampleRates(), s.Sampler.GetDefaultSampleRate()) +} + +// GetType return the type of the sampler engine +func (s *PriorityEngine) GetType() EngineType { + return PriorityEngineType +} diff --git a/pkg/trace/sampler/prioritysampler_test.go b/pkg/trace/sampler/prioritysampler_test.go new file mode 100644 index 0000000000000..1b17c2409f32f --- /dev/null +++ b/pkg/trace/sampler/prioritysampler_test.go @@ -0,0 +1,218 @@ +package sampler + +import ( + "math" + "math/rand" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" + + "github.com/stretchr/testify/assert" +) + +const ( + testServiceA = "service-a" + testServiceB = "service-b" +) + +func getTestPriorityEngine() *PriorityEngine { + // Disable debug logs in these tests + log.UseLogger(log.Disabled) + + // No extra fixed sampling, no maximum TPS + extraRate := 1.0 + maxTPS := 0.0 + + rateByService := RateByService{} + return NewPriorityEngine(extraRate, maxTPS, &rateByService) +} + +func getTestTraceWithService(t *testing.T, service string, s *PriorityEngine) (pb.Trace, *pb.Span) { + tID := randomTraceID() + trace := pb.Trace{ + &pb.Span{TraceID: tID, SpanID: 1, ParentID: 0, Start: 42, Duration: 1000000, Service: service, Type: "web", Meta: map[string]string{"env": defaultEnv}}, + &pb.Span{TraceID: tID, SpanID: 2, ParentID: 1, Start: 100, Duration: 200000, Service: service, Type: "sql"}, + } + r := rand.Float64() + priority := PriorityAutoDrop + rates := s.ratesByService() + key := ServiceSignature{trace[0].Service, defaultEnv} + var rate float64 + if r, ok := rates[key]; ok { + rate = r + } else { + rate = 1 + } + if r <= rate { + priority = PriorityAutoKeep + } + SetSamplingPriority(trace[0], priority) + return trace, trace[0] +} + +func TestPrioritySample(t *testing.T) { + // Simple sample unit test + assert := assert.New(t) + + env := defaultEnv + + s := getTestPriorityEngine() + trace, root := getTestTraceWithService(t, "my-service", s) + + assert.Equal(0.0, s.Sampler.Backend.GetTotalScore(), "checking fresh backend total score is 0") + assert.Equal(0.0, s.Sampler.Backend.GetSampledScore(), "checkeing fresh backend sampled score is 0") + + s = getTestPriorityEngine() + trace, root = getTestTraceWithService(t, "my-service", s) + + SetSamplingPriority(root, -1) + sampled, rate := s.Sample(trace, root, env) + assert.False(sampled, "trace with negative priority is dropped") + assert.Equal(0.0, rate, "dropping all traces") + assert.Equal(0.0, s.Sampler.Backend.GetTotalScore(), "sampling a priority -1 trace should *NOT* impact sampler backend") + assert.Equal(0.0, s.Sampler.Backend.GetSampledScore(), "sampling a priority -1 trace should *NOT* impact sampler backend") + + s = getTestPriorityEngine() + trace, root = getTestTraceWithService(t, "my-service", s) + + SetSamplingPriority(root, 0) + sampled, _ = s.Sample(trace, root, env) + assert.False(sampled, "trace with priority 0 is dropped") + assert.True(0.0 < s.Sampler.Backend.GetTotalScore(), "sampling a priority 0 trace should increase total score") + assert.Equal(0.0, s.Sampler.Backend.GetSampledScore(), "sampling a priority 0 trace should *NOT* increase sampled score") + + s = getTestPriorityEngine() + trace, root = getTestTraceWithService(t, "my-service", s) + + SetSamplingPriority(root, 1) + sampled, _ = s.Sample(trace, root, env) + assert.True(sampled, "trace with priority 1 is kept") + assert.True(0.0 < 
s.Sampler.Backend.GetTotalScore(), "sampling a priority 0 trace should increase total score") + assert.True(0.0 < s.Sampler.Backend.GetSampledScore(), "sampling a priority 0 trace should increase sampled score") + + s = getTestPriorityEngine() + trace, root = getTestTraceWithService(t, "my-service", s) + + SetSamplingPriority(root, 2) + sampled, rate = s.Sample(trace, root, env) + assert.True(sampled, "trace with priority 2 is kept") + assert.Equal(1.0, rate, "sampling all traces") + assert.Equal(0.0, s.Sampler.Backend.GetTotalScore(), "sampling a priority 2 trace should *NOT* increase total score") + assert.Equal(0.0, s.Sampler.Backend.GetSampledScore(), "sampling a priority 2 trace should *NOT* increase sampled score") + + s = getTestPriorityEngine() + trace, root = getTestTraceWithService(t, "my-service", s) + + SetSamplingPriority(root, PriorityUserKeep) + sampled, rate = s.Sample(trace, root, env) + assert.True(sampled, "trace with high priority is kept") + assert.Equal(1.0, rate, "sampling all traces") + assert.Equal(0.0, s.Sampler.Backend.GetTotalScore(), "sampling a high priority trace should *NOT* increase total score") + assert.Equal(0.0, s.Sampler.Backend.GetSampledScore(), "sampling a high priority trace should *NOT* increase sampled score") + + delete(root.Metrics, KeySamplingPriority) + sampled, _ = s.Sample(trace, root, env) + assert.False(sampled, "this should not happen but a trace without priority sampling set should be dropped") +} + +func TestPrioritySampleTracerWeight(t *testing.T) { + // Simple sample unit test + assert := assert.New(t) + env := defaultEnv + + s := getTestPriorityEngine() + clientRate := 0.33 + for i := 0; i < 10; i++ { + trace, root := getTestTraceWithService(t, "my-service", s) + SetSamplingPriority(root, SamplingPriority(i%2)) + root.Metrics[SamplingPriorityRateKey] = clientRate + _, rate := s.Sample(trace, root, env) + assert.Equal(clientRate, rate) + } +} + +func TestMaxTPSByService(t *testing.T) { + rand.Seed(1) + // Test the "effectiveness" of the maxTPS option. + assert := assert.New(t) + s := getTestPriorityEngine() + + type testCase struct { + maxTPS float64 + tps float64 + relativeError float64 + } + testCases := []testCase{ + {maxTPS: 10.0, tps: 20.0, relativeError: 0.2}, + } + if !testing.Short() { + testCases = append(testCases, + testCase{maxTPS: 5.0, tps: 50.0, relativeError: 0.2}, + testCase{maxTPS: 3.0, tps: 200.0, relativeError: 0.2}, + testCase{maxTPS: 1.0, tps: 1000.0, relativeError: 0.2}, + testCase{maxTPS: 10.0, tps: 10.0, relativeError: 0.001}, + testCase{maxTPS: 10.0, tps: 3.0, relativeError: 0.001}) + } + + // To avoid the edge effects from an non-initialized sampler, wait a bit before counting samples. + const ( + initPeriods = 50 + periods = 500 + ) + + for _, tc := range testCases { + t.Logf("testing maxTPS=%0.1f tps=%0.1f", tc.maxTPS, tc.tps) + s.Sampler.maxTPS = tc.maxTPS + periodSeconds := defaultDecayPeriod.Seconds() + tracesPerPeriod := tc.tps * periodSeconds + // Set signature score offset high enough not to kick in during the test. 
+ s.Sampler.signatureScoreOffset.Store(2 * tc.tps) + s.Sampler.signatureScoreFactor.Store(math.Pow(s.Sampler.signatureScoreSlope.Load(), math.Log10(s.Sampler.signatureScoreOffset.Load()))) + + sampledCount := 0 + handledCount := 0 + + for period := 0; period < initPeriods+periods; period++ { + s.Sampler.Backend.(*MemoryBackend).decayScore() + s.Sampler.AdjustScoring() + for i := 0; i < int(tracesPerPeriod); i++ { + trace, root := getTestTraceWithService(t, "service-a", s) + sampled, _ := s.Sample(trace, root, defaultEnv) + // Once we got into the "supposed-to-be" stable "regime", count the samples + if period > initPeriods { + handledCount++ + if sampled { + sampledCount++ + } + } + } + } + + // When tps is lower than maxTPS it means that we are actually not sampling + // anything, so the target is the original tps, and not maxTPS. + // Also, in that case, results should be more precise. + targetTPS := tc.maxTPS + relativeError := 0.01 + if tc.maxTPS > tc.tps { + targetTPS = tc.tps + } else { + relativeError = 0.1 + defaultDecayFactor - 1 + } + + // Check that the sampled score is roughly equal to maxTPS. This is different from + // the score sampler test as here we run adjustscoring on a regular basis so the converges to maxTPS. + assert.InEpsilon(targetTPS, s.Sampler.Backend.GetSampledScore(), relativeError) + + // We should have keep the right percentage of traces + assert.InEpsilon(targetTPS/tc.tps, float64(sampledCount)/float64(handledCount), relativeError) + + // We should have a throughput of sampled traces around maxTPS + // Check for 1% epsilon, but the precision also depends on the backend imprecision (error factor = decayFactor). + // Combine error rates with L1-norm instead of L2-norm by laziness, still good enough for tests. + assert.InEpsilon(targetTPS, float64(sampledCount)/(float64(periods)*periodSeconds), relativeError) + } +} + +// Ensure PriorityEngine implements engine. +var testPriorityEngine Engine = &PriorityEngine{} diff --git a/pkg/trace/sampler/sampler.go b/pkg/trace/sampler/sampler.go new file mode 100644 index 0000000000000..339c4b19803d9 --- /dev/null +++ b/pkg/trace/sampler/sampler.go @@ -0,0 +1,189 @@ +// Package sampler contains all the logic of the agent-side trace sampling +// +// Currently implementation is based on the scoring of the "signature" of each trace +// Based on the score, we get a sample rate to apply to the given trace +// +// Current score implementation is super-simple, it is a counter with polynomial decay per signature. +// We increment it for each incoming trace then we periodically divide the score by two every X seconds. +// Right after the division, the score is an approximation of the number of received signatures over X seconds. +// It is different from the scoring in the Agent. +// +// Since the sampling can happen at different levels (client, agent, server) or depending on different rules, +// we have to track the sample rate applied at previous steps. This way, sampling twice at 50% can result in an +// effective 25% sampling. The rate is stored as a metric in the trace root. +package sampler + +import ( + "math" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +const ( + // KeySamplingRateGlobal is a metric key holding the global sampling rate. + KeySamplingRateGlobal = "_sample_rate" + + // KeySamplingRateClient is a metric key holding the client-set sampling rate for APM events. + KeySamplingRateClient = "_dd1.sr.rcusr" + + // KeySamplingRatePreSampler is a metric key holding the pre-sampler rate for APM events. 
+ KeySamplingRatePreSampler = "_dd1.sr.rapre" + + // KeySamplingRateEventExtraction is the key of the metric storing the event extraction rate on an APM event. + KeySamplingRateEventExtraction = "_dd1.sr.eausr" + + // KeySamplingRateMaxEPSSampler is the key of the metric storing the max eps sampler rate on an APM event. + KeySamplingRateMaxEPSSampler = "_dd1.sr.eamax" + + // KeySamplingPriority is the key of the sampling priority value in the metrics map of the root span + KeySamplingPriority = "_sampling_priority_v1" +) + +// SamplingPriority is the type encoding a priority sampling decision. +type SamplingPriority int8 + +const ( + // PriorityNone is the value for SamplingPriority when no priority sampling decision could be found. + PriorityNone SamplingPriority = math.MinInt8 + + // PriorityUserDrop is the value set by a user to explicitly drop a trace. + PriorityUserDrop SamplingPriority = -1 + + // PriorityAutoDrop is the value set by a tracer to suggest dropping a trace. + PriorityAutoDrop SamplingPriority = 0 + + // PriorityAutoKeep is the value set by a tracer to suggest keeping a trace. + PriorityAutoKeep SamplingPriority = 1 + + // PriorityUserKeep is the value set by a user to explicitly keep a trace. + PriorityUserKeep SamplingPriority = 2 +) + +// Weight returns the weight of the span as defined for sampling, i.e. the +// inverse of the sampling rate. +func Weight(s *pb.Span) float64 { + if s == nil { + return 1.0 + } + sampleRate, ok := s.Metrics[KeySamplingRateGlobal] + if !ok || sampleRate <= 0.0 || sampleRate > 1.0 { + return 1.0 + } + + return 1.0 / sampleRate +} + +// GetSamplingPriority returns the value of the sampling priority metric set on this span and a boolean indicating if +// such a metric was actually found or not. +func GetSamplingPriority(s *pb.Span) (SamplingPriority, bool) { + p, ok := getMetric(s, KeySamplingPriority) + return SamplingPriority(p), ok +} + +// SetSamplingPriority sets the sampling priority value on this span, overwriting any previously set value. +func SetSamplingPriority(s *pb.Span, priority SamplingPriority) { + setMetric(s, KeySamplingPriority, float64(priority)) +} + +// GetGlobalRate gets the cumulative sample rate of the trace to which this span belongs to. +func GetGlobalRate(s *pb.Span) float64 { + return getMetricDefault(s, KeySamplingRateGlobal, 1.0) +} + +// SetGlobalRate sets the cumulative sample rate of the trace to which this span belongs to. +func SetGlobalRate(s *pb.Span, rate float64) { + setMetric(s, KeySamplingRateGlobal, rate) +} + +// AddGlobalRate updates the cumulative sample rate of the trace to which this span belongs to with the provided +// rate which is assumed to belong to an independent sampler. The combination is done by simple multiplications. +func AddGlobalRate(s *pb.Span, rate float64) { + setMetric(s, KeySamplingRateGlobal, GetGlobalRate(s)*rate) +} + +// GetClientRate gets the rate at which the trace this span belongs to was sampled by the tracer. +// NOTE: This defaults to 1 if no rate is stored. +func GetClientRate(s *pb.Span) float64 { + return getMetricDefault(s, KeySamplingRateClient, 1.0) +} + +// SetClientRate sets the rate at which the trace this span belongs to was sampled by the tracer. +func SetClientRate(s *pb.Span, rate float64) { + if rate < 1 { + setMetric(s, KeySamplingRateClient, rate) + } else { + // We assume missing value is 1 to save bandwidth (check getter). 
+ delete(s.Metrics, KeySamplingRateClient) + } +} + +// GetPreSampleRate returns the rate at which the trace this span belongs to was sampled by the agent's presampler. +// NOTE: This defaults to 1 if no rate is stored. +func GetPreSampleRate(s *pb.Span) float64 { + return getMetricDefault(s, KeySamplingRatePreSampler, 1.0) +} + +// SetPreSampleRate sets the rate at which the trace this span belongs to was sampled by the agent's presampler. +func SetPreSampleRate(s *pb.Span, rate float64) { + if rate < 1 { + setMetric(s, KeySamplingRatePreSampler, rate) + } else { + // We assume missing value is 1 to save bandwidth (check getter). + delete(s.Metrics, KeySamplingRatePreSampler) + } +} + +// GetEventExtractionRate gets the rate at which the trace from which we extracted this event was sampled at the tracer. +// This defaults to 1 if no rate is stored. +func GetEventExtractionRate(s *pb.Span) float64 { + return getMetricDefault(s, KeySamplingRateEventExtraction, 1.0) +} + +// SetEventExtractionRate sets the rate at which the trace from which we extracted this event was sampled at the tracer. +func SetEventExtractionRate(s *pb.Span, rate float64) { + if rate < 1 { + setMetric(s, KeySamplingRateEventExtraction, rate) + } else { + // reduce bandwidth, default is assumed 1.0 in backend + delete(s.Metrics, KeySamplingRateEventExtraction) + } +} + +// GetMaxEPSRate gets the rate at which this event was sampled by the max eps event sampler. +func GetMaxEPSRate(s *pb.Span) float64 { + return getMetricDefault(s, KeySamplingRateMaxEPSSampler, 1.0) +} + +// SetMaxEPSRate sets the rate at which this event was sampled by the max eps event sampler. +func SetMaxEPSRate(s *pb.Span, rate float64) { + if rate < 1 { + setMetric(s, KeySamplingRateMaxEPSSampler, rate) + } else { + // reduce bandwidth, default is assumed 1.0 in backend + delete(s.Metrics, KeySamplingRateMaxEPSSampler) + } +} + +func getMetric(s *pb.Span, k string) (float64, bool) { + if s.Metrics == nil { + return 0, false + } + val, ok := s.Metrics[k] + return val, ok +} + +// getMetricDefault gets a value in the span Metrics map or default if no value is stored there. +func getMetricDefault(s *pb.Span, k string, def float64) float64 { + if val, ok := getMetric(s, k); ok { + return val + } + return def +} + +// setMetric sets a value in the span Metrics map. 
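+// The Metrics map is allocated lazily, so the setters above are safe to call on
+// spans that were decoded without any metrics.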
+func setMetric(s *pb.Span, key string, val float64) { + if s.Metrics == nil { + s.Metrics = make(map[string]float64) + } + s.Metrics[key] = val +} diff --git a/pkg/trace/sampler/sampler_test.go b/pkg/trace/sampler/sampler_test.go new file mode 100644 index 0000000000000..064a535656ec7 --- /dev/null +++ b/pkg/trace/sampler/sampler_test.go @@ -0,0 +1,65 @@ +package sampler + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func testSpan() *pb.Span { + return &pb.Span{ + Duration: 10000000, + Error: 0, + Resource: "GET /some/raclette", + Service: "django", + Name: "django.controller", + SpanID: 42, + Start: 1448466874000000000, + TraceID: 424242, + Meta: map[string]string{ + "user": "leo", + "pool": "fondue", + }, + Metrics: map[string]float64{ + "cheese_weight": 100000.0, + }, + ParentID: 1111, + Type: "http", + } +} + +func TestSpanString(t *testing.T) { + assert := assert.New(t) + assert.NotEqual("", testSpan().String()) +} + +func TestSpanWeight(t *testing.T) { + assert := assert.New(t) + + span := testSpan() + assert.Equal(1.0, Weight(span)) + + span.Metrics[KeySamplingRateGlobal] = -1.0 + assert.Equal(1.0, Weight(span)) + + span.Metrics[KeySamplingRateGlobal] = 0.0 + assert.Equal(1.0, Weight(span)) + + span.Metrics[KeySamplingRateGlobal] = 0.25 + assert.Equal(4.0, Weight(span)) + + span.Metrics[KeySamplingRateGlobal] = 1.0 + assert.Equal(1.0, Weight(span)) + + span.Metrics[KeySamplingRateGlobal] = 1.5 + assert.Equal(1.0, Weight(span)) +} + +func TestSpanWeightNil(t *testing.T) { + assert := assert.New(t) + + var span *pb.Span + + assert.Equal(1.0, Weight(span), "Weight should be callable on nil and return a default value") +} diff --git a/pkg/trace/sampler/score.go b/pkg/trace/sampler/score.go new file mode 100644 index 0000000000000..47a12faab43dc --- /dev/null +++ b/pkg/trace/sampler/score.go @@ -0,0 +1,79 @@ +package sampler + +import ( + "math" +) + +const ( + // 2^64 - 1 + maxTraceID = ^uint64(0) + maxTraceIDFloat = float64(maxTraceID) + // Good number for Knuth hashing (large, prime, fit in int64 for languages without uint64) + samplerHasher = uint64(1111111111111111111) +) + +// SampleByRate tells if a trace (from its ID) with a given rate should be sampled +// Use Knuth multiplicative hashing to leverage imbalanced traceID generators +func SampleByRate(traceID uint64, rate float64) bool { + if rate < 1 { + return traceID*samplerHasher < uint64(rate*maxTraceIDFloat) + } + return true +} + +func capTo1(f float64) float64 { + if f > 1 { + return 1 + } + return f +} + +// GetSignatureSampleRate gives the sample rate to apply to any signature. +// For now, only based on count score. +func (s *Sampler) GetSignatureSampleRate(signature Signature) float64 { + return capTo1(s.GetCountScore(signature)) +} + +// GetAllSignatureSampleRates gives the sample rate to apply to all signatures. +// For now, only based on count score. +func (s *Sampler) GetAllSignatureSampleRates() map[Signature]float64 { + m := s.GetAllCountScores() + for k, v := range m { + m[k] = capTo1(v) + } + return m +} + +// GetDefaultSampleRate gives the sample rate to apply to an unknown signature. +// For now, only based on count score. 
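SampleByRate makes the keep/drop decision a pure function of the trace ID and the rate: the same trace always gets the same answer, and over the whole ID space roughly `rate` of traces are kept. A short sketch using an arbitrary trace ID:

package main

import (
	"fmt"

	"github.com/DataDog/datadog-agent/pkg/trace/sampler"
)

func main() {
	const traceID uint64 = 424242 // arbitrary ID for illustration
	fmt.Println(sampler.SampleByRate(traceID, 1.0)) // true: a rate >= 1 keeps everything
	fmt.Println(sampler.SampleByRate(traceID, 0.5)) // deterministic for this ID...
	fmt.Println(sampler.SampleByRate(traceID, 0.5)) // ...so repeated calls agree
}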
+func (s *Sampler) GetDefaultSampleRate() float64 { + return capTo1(s.GetDefaultCountScore()) +} + +func (s *Sampler) backendScoreToSamplerScore(score float64) float64 { + return s.signatureScoreFactor.Load() / math.Pow(s.signatureScoreSlope.Load(), math.Log10(score)) +} + +// GetCountScore scores any signature based on its recent throughput +// The score value can be seeing as the sample rate if the count were the only factor +// Since other factors can intervene (such as extra global sampling), its value can be larger than 1 +func (s *Sampler) GetCountScore(signature Signature) float64 { + return s.backendScoreToSamplerScore(s.Backend.GetSignatureScore(signature)) +} + +// GetAllCountScores scores all signatures based on their recent throughput +// The score value can be seeing as the sample rate if the count were the only factor +// Since other factors can intervene (such as extra global sampling), its value can be larger than 1 +func (s *Sampler) GetAllCountScores() map[Signature]float64 { + m := s.Backend.GetSignatureScores() + for k, v := range m { + m[k] = s.backendScoreToSamplerScore(v) + } + return m +} + +// GetDefaultCountScore returns a default score when not knowing the signature for real. +// Since other factors can intervene (such as extra global sampling), its value can be larger than 1 +func (s *Sampler) GetDefaultCountScore() float64 { + return s.backendScoreToSamplerScore(s.Backend.GetTotalScore()) +} diff --git a/pkg/trace/sampler/score_test.go b/pkg/trace/sampler/score_test.go new file mode 100644 index 0000000000000..deb3212468e43 --- /dev/null +++ b/pkg/trace/sampler/score_test.go @@ -0,0 +1,44 @@ +package sampler + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" +) + +func randomTraceID() uint64 { + return uint64(rand.Int63()) +} + +func TestTrivialSampleByRate(t *testing.T) { + assert := assert.New(t) + + assert.False(SampleByRate(randomTraceID(), 0)) + assert.True(SampleByRate(randomTraceID(), 1)) +} + +func TestSampleRateManyTraces(t *testing.T) { + // Test that the effective sample rate isn't far from the theoretical + // Test with multiple sample rates + assert := assert.New(t) + + times := 1000000 + + for _, rate := range []float64{1.0, 0.1, 0.5, 0.99} { + sampled := 0 + for i := 0; i < times; i++ { + if SampleByRate(randomTraceID(), rate) { + sampled++ + } + } + assert.InEpsilon(float64(sampled), float64(times)*rate, 0.01) + } +} + +func BenchmarkBackendScoreToSamplerScore(b *testing.B) { + s := newSampler(1.0, 10) + for i := 0; i < b.N; i++ { + s.backendScoreToSamplerScore(10) + } +} diff --git a/pkg/trace/sampler/scoresampler.go b/pkg/trace/sampler/scoresampler.go new file mode 100644 index 0000000000000..4d03330cda839 --- /dev/null +++ b/pkg/trace/sampler/scoresampler.go @@ -0,0 +1,92 @@ +package sampler + +import "github.com/DataDog/datadog-agent/pkg/trace/pb" + +// ScoreEngine is the main component of the sampling logic +type ScoreEngine struct { + // Sampler is the underlying sampler used by this engine, sharing logic among various engines. + Sampler *Sampler + engineType EngineType +} + +// NewScoreEngine returns an initialized Sampler +func NewScoreEngine(extraRate float64, maxTPS float64) *ScoreEngine { + s := &ScoreEngine{ + Sampler: newSampler(extraRate, maxTPS), + engineType: NormalScoreEngineType, + } + + return s +} + +// NewErrorsEngine returns an initialized Sampler dedicate to errors. It behaves +// just like the the normal ScoreEngine except for its GetType method (useful +// for reporting). 
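backendScoreToSamplerScore above maps a signature's recent throughput to a target sample rate as factor / slope^log10(score). The helper below mirrors that formula with made-up factor and slope values (the real Sampler keeps its own factor, slope and offset internally), just to show the shape of the curve: higher recent throughput maps to a lower target rate.

package main

import (
	"fmt"
	"math"
)

// scoreToRate is a hypothetical standalone version of backendScoreToSamplerScore.
func scoreToRate(factor, slope, score float64) float64 {
	return factor / math.Pow(slope, math.Log10(score))
}

func main() {
	fmt.Println(scoreToRate(10, 10, 1))     // 10, capped to 1 by capTo1 in practice
	fmt.Println(scoreToRate(10, 10, 100))   // 0.1
	fmt.Println(scoreToRate(10, 10, 10000)) // 0.001
}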
+func NewErrorsEngine(extraRate float64, maxTPS float64) *ScoreEngine { + s := &ScoreEngine{ + Sampler: newSampler(extraRate, maxTPS), + engineType: ErrorsScoreEngineType, + } + + return s +} + +// Run runs and block on the Sampler main loop +func (s *ScoreEngine) Run() { + s.Sampler.Run() +} + +// Stop stops the main Run loop +func (s *ScoreEngine) Stop() { + s.Sampler.Stop() +} + +func applySampleRate(root *pb.Span, rate float64) bool { + initialRate := GetGlobalRate(root) + newRate := initialRate * rate + traceID := root.TraceID + return SampleByRate(traceID, newRate) +} + +// Sample counts an incoming trace and tells if it is a sample which has to be kept +func (s *ScoreEngine) Sample(trace pb.Trace, root *pb.Span, env string) (sampled bool, rate float64) { + // Extra safety, just in case one trace is empty + if len(trace) == 0 { + return false, 0 + } + + signature := computeSignatureWithRootAndEnv(trace, root, env) + + // Update sampler state by counting this trace + s.Sampler.Backend.CountSignature(signature) + + rate = s.Sampler.GetSampleRate(trace, root, signature) + + sampled = applySampleRate(root, rate) + + if sampled { + // Count the trace to allow us to check for the maxTPS limit. + // It has to happen before the maxTPS sampling. + s.Sampler.Backend.CountSample() + + // Check for the maxTPS limit, and if we require an extra sampling. + // No need to check if we already decided not to keep the trace. + maxTPSrate := s.Sampler.GetMaxTPSSampleRate() + if maxTPSrate < 1 { + sampled = applySampleRate(root, maxTPSrate) + } + } + + return sampled, rate +} + +// GetState collects and return internal statistics and coefficients for indication purposes +// It returns an interface{}, as other samplers might return other informations. +func (s *ScoreEngine) GetState() interface{} { + return s.Sampler.GetState() +} + +// GetType returns the type of the sampler +func (s *ScoreEngine) GetType() EngineType { + return s.engineType +} diff --git a/pkg/trace/sampler/scoresampler_test.go b/pkg/trace/sampler/scoresampler_test.go new file mode 100644 index 0000000000000..2bf2bf054143a --- /dev/null +++ b/pkg/trace/sampler/scoresampler_test.go @@ -0,0 +1,124 @@ +package sampler + +import ( + "math" + "math/rand" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" + "github.com/stretchr/testify/assert" +) + +const defaultEnv = "none" + +func getTestScoreEngine() *ScoreEngine { + // Disable debug logs in these tests + log.UseLogger(log.Disabled) + + // No extra fixed sampling, no maximum TPS + extraRate := 1.0 + maxTPS := 0.0 + + return NewScoreEngine(extraRate, maxTPS) +} + +func getTestTrace() (pb.Trace, *pb.Span) { + tID := randomTraceID() + trace := pb.Trace{ + &pb.Span{TraceID: tID, SpanID: 1, ParentID: 0, Start: 42, Duration: 1000000, Service: "mcnulty", Type: "web"}, + &pb.Span{TraceID: tID, SpanID: 2, ParentID: 1, Start: 100, Duration: 200000, Service: "mcnulty", Type: "sql"}, + } + return trace, trace[0] +} + +func TestExtraSampleRate(t *testing.T) { + assert := assert.New(t) + + s := getTestScoreEngine() + trace, root := getTestTrace() + signature := testComputeSignature(trace) + + // Feed the s with a signature so that it has a < 1 sample rate + for i := 0; i < int(1e6); i++ { + s.Sample(trace, root, defaultEnv) + } + + sRate := s.Sampler.GetSampleRate(trace, root, signature) + + // Then turn on the extra sample rate, then ensure it affects both existing and new signatures + s.Sampler.extraRate = 0.33 + + 
assert.Equal(s.Sampler.GetSampleRate(trace, root, signature), s.Sampler.extraRate*sRate) +} + +func TestMaxTPS(t *testing.T) { + // Test the "effectiveness" of the maxTPS option. + assert := assert.New(t) + s := getTestScoreEngine() + + maxTPS := 5.0 + tps := 100.0 + // To avoid the edge effects from an non-initialized sampler, wait a bit before counting samples. + initPeriods := 20 + periods := 50 + + s.Sampler.maxTPS = maxTPS + periodSeconds := defaultDecayPeriod.Seconds() + tracesPerPeriod := tps * periodSeconds + // Set signature score offset high enough not to kick in during the test. + s.Sampler.signatureScoreOffset.Store(2 * tps) + s.Sampler.signatureScoreFactor.Store(math.Pow(s.Sampler.signatureScoreSlope.Load(), math.Log10(s.Sampler.signatureScoreOffset.Load()))) + + sampledCount := 0 + + for period := 0; period < initPeriods+periods; period++ { + s.Sampler.Backend.(*MemoryBackend).decayScore() + for i := 0; i < int(tracesPerPeriod); i++ { + trace, root := getTestTrace() + sampled, _ := s.Sample(trace, root, defaultEnv) + // Once we got into the "supposed-to-be" stable "regime", count the samples + if period > initPeriods && sampled { + sampledCount++ + } + } + } + + // Check that the sampled score pre-maxTPS is equals to the incoming number of traces per second + assert.InEpsilon(tps, s.Sampler.Backend.GetSampledScore(), 0.01) + + // We should have kept less traces per second than maxTPS + assert.True(s.Sampler.maxTPS >= float64(sampledCount)/(float64(periods)*periodSeconds)) + + // We should have a throughput of sampled traces around maxTPS + // Check for 1% epsilon, but the precision also depends on the backend imprecision (error factor = decayFactor). + // Combine error rates with L1-norm instead of L2-norm by laziness, still good enough for tests. + assert.InEpsilon(s.Sampler.maxTPS, float64(sampledCount)/(float64(periods)*periodSeconds), + 0.01+defaultDecayFactor-1) +} + +func BenchmarkSampler(b *testing.B) { + // Benchmark the resource consumption of many traces sampling + + // Up to signatureCount different signatures + signatureCount := 20 + + s := getTestScoreEngine() + + b.ResetTimer() + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + trace := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Start: 42, Duration: 1000000000, Service: "mcnulty", Type: "web", Resource: string(rand.Intn(signatureCount))}, + &pb.Span{TraceID: 1, SpanID: 2, ParentID: 1, Start: 100, Duration: 200000000, Service: "mcnulty", Type: "sql"}, + &pb.Span{TraceID: 1, SpanID: 3, ParentID: 2, Start: 150, Duration: 199999000, Service: "master-db", Type: "sql"}, + &pb.Span{TraceID: 1, SpanID: 4, ParentID: 1, Start: 500000000, Duration: 500000, Service: "redis", Type: "redis"}, + &pb.Span{TraceID: 1, SpanID: 5, ParentID: 1, Start: 700000000, Duration: 700000, Service: "mcnulty", Type: ""}, + } + s.Sample(trace, trace[0], defaultEnv) + } +} + +// Ensure ScoreEngine implements engine. +var testScoreEngine Engine = &ScoreEngine{} diff --git a/pkg/trace/sampler/signature.go b/pkg/trace/sampler/signature.go new file mode 100644 index 0000000000000..64aa648c3abd7 --- /dev/null +++ b/pkg/trace/sampler/signature.go @@ -0,0 +1,88 @@ +package sampler + +import ( + "hash/fnv" + "sort" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// Signature is a hash representation of trace or a service, used to identify +// simlar signatures. 
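Putting the pieces together, a minimal usage sketch of the score engine, following the same pattern as the tests above (like the tests, it skips the long-running Run loop; span values are arbitrary):

package main

import (
	"fmt"

	"github.com/DataDog/datadog-agent/pkg/trace/pb"
	"github.com/DataDog/datadog-agent/pkg/trace/sampler"
)

func main() {
	engine := sampler.NewScoreEngine(1.0, 0) // no extra fixed sampling, no maxTPS cap
	trace := pb.Trace{
		&pb.Span{TraceID: 42, SpanID: 1, Service: "web", Name: "request", Resource: "GET /", Duration: 1000},
	}
	sampled, rate := engine.Sample(trace, trace[0], "none")
	fmt.Println(sampled, rate)
}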
+type Signature uint64 + +// spanHash is the type of the hashes used during the computation of a signature +// Use FNV for hashing since it is super-cheap and we have no cryptographic needs +type spanHash uint32 +type spanHashSlice []spanHash + +func (p spanHashSlice) Len() int { return len(p) } +func (p spanHashSlice) Less(i, j int) bool { return p[i] < p[j] } +func (p spanHashSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } +func sortHashes(hashes []spanHash) { sort.Sort(spanHashSlice(hashes)) } + +// computeSignatureWithRootAndEnv generates the signature of a trace knowing its root +// Signature based on the hash of (env, service, name, resource, is_error) for the root, plus the set of +// (env, service, name, is_error) of each span. +func computeSignatureWithRootAndEnv(trace pb.Trace, root *pb.Span, env string) Signature { + rootHash := computeRootHash(*root, env) + spanHashes := make([]spanHash, 0, len(trace)) + + for i := range trace { + spanHashes = append(spanHashes, computeSpanHash(trace[i], env)) + } + + // Now sort, dedupe then merge all the hashes to build the signature + sortHashes(spanHashes) + + last := spanHashes[0] + traceHash := last ^ rootHash + for i := 1; i < len(spanHashes); i++ { + if spanHashes[i] != last { + last = spanHashes[i] + traceHash = spanHashes[i] ^ traceHash + } + } + + return Signature(traceHash) +} + +// ServiceSignature represents a unique way to identify a service. +type ServiceSignature struct{ Name, Env string } + +// Hash generates the signature of a trace with minimal information such as +// service and env, this is typically used by distributed sampling based on +// priority, and used as a key to store the desired rate for a given +// service,env tuple. +func (s ServiceSignature) Hash() Signature { + h := fnv.New32a() + h.Write([]byte(s.Name)) + h.Write([]byte{','}) + h.Write([]byte(s.Env)) + return Signature(h.Sum32()) +} + +func (s ServiceSignature) String() string { + return "service:" + s.Name + ",env:" + s.Env +} + +func computeSpanHash(span *pb.Span, env string) spanHash { + h := fnv.New32a() + h.Write([]byte(env)) + h.Write([]byte(span.Service)) + h.Write([]byte(span.Name)) + h.Write([]byte{byte(span.Error)}) + + return spanHash(h.Sum32()) +} + +func computeRootHash(span pb.Span, env string) spanHash { + h := fnv.New32a() + h.Write([]byte(env)) + h.Write([]byte(span.Service)) + h.Write([]byte(span.Name)) + h.Write([]byte(span.Resource)) + h.Write([]byte{byte(span.Error)}) + + return spanHash(h.Sum32()) +} diff --git a/pkg/trace/sampler/signature_test.go b/pkg/trace/sampler/signature_test.go new file mode 100644 index 0000000000000..e92696f829279 --- /dev/null +++ b/pkg/trace/sampler/signature_test.go @@ -0,0 +1,128 @@ +package sampler + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + "github.com/stretchr/testify/assert" +) + +func testComputeSignature(trace pb.Trace) Signature { + root := traceutil.GetRoot(trace) + env := traceutil.GetEnv(trace) + return computeSignatureWithRootAndEnv(trace, root, env) +} + +func TestSignatureSimilar(t *testing.T) { + assert := assert.New(t) + + t1 := pb.Trace{ + &pb.Span{TraceID: 101, SpanID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 26965}, + &pb.Span{TraceID: 101, SpanID: 1012, ParentID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 197884}, + &pb.Span{TraceID: 101, SpanID: 1013, ParentID: 1012, Service: "x1", Name: "y1", Resource: "z1", Duration: 12304982304}, + &pb.Span{TraceID: 101, 
SpanID: 1014, ParentID: 1013, Service: "x2", Name: "y2", Resource: "z2", Duration: 34384993}, + } + t2 := pb.Trace{ + &pb.Span{TraceID: 102, SpanID: 1021, Service: "x1", Name: "y1", Resource: "z1", Duration: 992312}, + &pb.Span{TraceID: 102, SpanID: 1022, ParentID: 1021, Service: "x1", Name: "y1", Resource: "z1", Duration: 34347}, + &pb.Span{TraceID: 102, SpanID: 1023, ParentID: 1022, Service: "x2", Name: "y2", Resource: "z2", Duration: 349944}, + } + + assert.Equal(testComputeSignature(t1), testComputeSignature(t2)) +} + +func TestSignatureDifferentError(t *testing.T) { + assert := assert.New(t) + + t1 := pb.Trace{ + &pb.Span{TraceID: 101, SpanID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 26965}, + &pb.Span{TraceID: 101, SpanID: 1012, ParentID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 197884}, + &pb.Span{TraceID: 101, SpanID: 1013, ParentID: 1012, Service: "x1", Name: "y1", Resource: "z1", Duration: 12304982304}, + &pb.Span{TraceID: 101, SpanID: 1014, ParentID: 1013, Service: "x2", Name: "y2", Resource: "z2", Duration: 34384993}, + } + t2 := pb.Trace{ + &pb.Span{TraceID: 110, SpanID: 1101, Service: "x1", Name: "y1", Resource: "z1", Duration: 992312}, + &pb.Span{TraceID: 110, SpanID: 1102, ParentID: 1101, Service: "x1", Name: "y1", Resource: "z1", Error: 1, Duration: 34347}, + &pb.Span{TraceID: 110, SpanID: 1103, ParentID: 1101, Service: "x2", Name: "y2", Resource: "z2", Duration: 349944}, + } + + assert.NotEqual(testComputeSignature(t1), testComputeSignature(t2)) +} + +func TestSignatureDifferentRoot(t *testing.T) { + assert := assert.New(t) + + t1 := pb.Trace{ + &pb.Span{TraceID: 101, SpanID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 26965}, + &pb.Span{TraceID: 101, SpanID: 1012, ParentID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 197884}, + &pb.Span{TraceID: 101, SpanID: 1013, ParentID: 1012, Service: "x1", Name: "y1", Resource: "z1", Duration: 12304982304}, + &pb.Span{TraceID: 101, SpanID: 1014, ParentID: 1013, Service: "x2", Name: "y2", Resource: "z2", Duration: 34384993}, + } + t2 := pb.Trace{ + &pb.Span{TraceID: 103, SpanID: 1031, Service: "x1", Name: "y1", Resource: "z2", Duration: 19207}, + &pb.Span{TraceID: 103, SpanID: 1032, ParentID: 1031, Service: "x1", Name: "y1", Resource: "z1", Duration: 234923874}, + &pb.Span{TraceID: 103, SpanID: 1033, ParentID: 1032, Service: "x1", Name: "y1", Resource: "z1", Duration: 152342344}, + } + + assert.NotEqual(testComputeSignature(t1), testComputeSignature(t2)) +} + +func testComputeServiceSignature(trace pb.Trace) Signature { + root := traceutil.GetRoot(trace) + env := traceutil.GetEnv(trace) + return ServiceSignature{root.Service, env}.Hash() +} + +func TestServiceSignatureSimilar(t *testing.T) { + assert := assert.New(t) + + t1 := pb.Trace{ + &pb.Span{TraceID: 101, SpanID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 26965}, + &pb.Span{TraceID: 101, SpanID: 1012, ParentID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 197884}, + &pb.Span{TraceID: 101, SpanID: 1013, ParentID: 1012, Service: "x1", Name: "y1", Resource: "z1", Duration: 12304982304}, + &pb.Span{TraceID: 101, SpanID: 1014, ParentID: 1013, Service: "x2", Name: "y2", Resource: "z2", Duration: 34384993}, + } + t2 := pb.Trace{ + &pb.Span{TraceID: 102, SpanID: 1021, Service: "x1", Name: "y2", Resource: "z2", Duration: 992312}, + &pb.Span{TraceID: 102, SpanID: 1022, ParentID: 1021, Service: "x1", Name: "y1", Resource: "z1", Error: 1, Duration: 34347}, + &pb.Span{TraceID: 102, SpanID: 1023, 
ParentID: 1022, Service: "x2", Name: "y2", Resource: "z2", Duration: 349944}, + } + assert.Equal(testComputeServiceSignature(t1), testComputeServiceSignature(t2)) +} + +func TestServiceSignatureDifferentService(t *testing.T) { + assert := assert.New(t) + + t1 := pb.Trace{ + &pb.Span{TraceID: 101, SpanID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 26965}, + &pb.Span{TraceID: 101, SpanID: 1012, ParentID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 197884}, + &pb.Span{TraceID: 101, SpanID: 1013, ParentID: 1012, Service: "x1", Name: "y1", Resource: "z1", Duration: 12304982304}, + &pb.Span{TraceID: 101, SpanID: 1014, ParentID: 1013, Service: "x2", Name: "y2", Resource: "z2", Duration: 34384993}, + } + t2 := pb.Trace{ + &pb.Span{TraceID: 103, SpanID: 1031, Service: "x2", Name: "y1", Resource: "z1", Duration: 19207}, + &pb.Span{TraceID: 103, SpanID: 1032, ParentID: 1031, Service: "x1", Name: "y1", Resource: "z1", Duration: 234923874}, + &pb.Span{TraceID: 103, SpanID: 1033, ParentID: 1032, Service: "x1", Name: "y1", Resource: "z1", Duration: 152342344}, + } + + assert.NotEqual(testComputeServiceSignature(t1), testComputeServiceSignature(t2)) +} + +func TestServiceSignatureDifferentEnv(t *testing.T) { + assert := assert.New(t) + + t1 := pb.Trace{ + &pb.Span{TraceID: 101, SpanID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 26965, Meta: map[string]string{"env": "test"}}, + &pb.Span{TraceID: 101, SpanID: 1012, ParentID: 1011, Service: "x1", Name: "y1", Resource: "z1", Duration: 197884}, + &pb.Span{TraceID: 101, SpanID: 1013, ParentID: 1012, Service: "x1", Name: "y1", Resource: "z1", Duration: 12304982304}, + &pb.Span{TraceID: 101, SpanID: 1014, ParentID: 1013, Service: "x2", Name: "y2", Resource: "z2", Duration: 34384993}, + } + t2 := pb.Trace{ + &pb.Span{TraceID: 110, SpanID: 1101, Service: "x1", Name: "y1", Resource: "z1", Duration: 992312, Meta: map[string]string{"env": "prod"}}, + &pb.Span{TraceID: 110, SpanID: 1102, ParentID: 1101, Service: "x1", Name: "y1", Resource: "z1", Duration: 34347}, + &pb.Span{TraceID: 110, SpanID: 1103, ParentID: 1101, Service: "x2", Name: "y2", Resource: "z2", Duration: 349944}, + } + + assert.NotEqual(testComputeServiceSignature(t1), testComputeServiceSignature(t2)) +} diff --git a/pkg/trace/sampler/state.go b/pkg/trace/sampler/state.go new file mode 100644 index 0000000000000..554e9d2250675 --- /dev/null +++ b/pkg/trace/sampler/state.go @@ -0,0 +1,23 @@ +package sampler + +// InternalState exposes all the main internal settings of the score sampler +type InternalState struct { + Offset float64 + Slope float64 + Cardinality int64 + InTPS float64 + OutTPS float64 + MaxTPS float64 +} + +// GetState collects and return internal statistics and coefficients for indication purposes +func (s *Sampler) GetState() InternalState { + return InternalState{ + Offset: s.signatureScoreOffset.Load(), + Slope: s.signatureScoreSlope.Load(), + Cardinality: s.Backend.GetCardinality(), + InTPS: s.Backend.GetTotalScore(), + OutTPS: s.Backend.GetSampledScore(), + MaxTPS: s.maxTPS, + } +} diff --git a/pkg/trace/test/agent.go b/pkg/trace/test/agent.go new file mode 100644 index 0000000000000..14a43352cd7c8 --- /dev/null +++ b/pkg/trace/test/agent.go @@ -0,0 +1,182 @@ +package test + +import ( + "bytes" + "errors" + "fmt" + "io/ioutil" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/spf13/viper" + yaml "gopkg.in/yaml.v2" +) + +// ErrNotInstalled is returned when the trace-agent can not be found in $PATH. 
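GetState exists mostly for reporting. A small sketch assuming a score engine; the interface{} returned by ScoreEngine.GetState wraps the Sampler's InternalState defined above:

package main

import (
	"fmt"

	"github.com/DataDog/datadog-agent/pkg/trace/sampler"
)

func main() {
	engine := sampler.NewScoreEngine(1.0, 100) // hypothetical extraRate and maxTPS
	state := engine.GetState().(sampler.InternalState)
	fmt.Printf("in=%.1f tps out=%.1f tps max=%.1f tps signatures=%d\n",
		state.InTPS, state.OutTPS, state.MaxTPS, state.Cardinality)
}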
+var ErrNotInstalled = errors.New("agent: trace-agent not found in $PATH") + +type agentRunner struct { + mu sync.RWMutex // guards pid + pid int // agent pid, if running + + port int // agent port + log *safeBuffer // agent log + ddAddr string // Datadog API address (host:port) + verbose bool +} + +func newAgentRunner(ddAddr string, verbose bool) (*agentRunner, error) { + if _, err := exec.LookPath("trace-agent"); err != nil { + // trace-agent not in $PATH, try to install + if verbose { + log.Print("agent: trace-agent not found, trying to install...") + } + err := exec.Command("go", "install", "github.com/DataDog/datadog-trace-agent/cmd/trace-agent").Run() + if err != nil { + return nil, ErrNotInstalled + } + if _, err := exec.LookPath("trace-agent"); err != nil { + // still not in $PATH, fail + if verbose { + log.Print("trace-agent installed but not found in $PATH") + } + return nil, ErrNotInstalled + } + } + return &agentRunner{ + ddAddr: ddAddr, + log: newSafeBuffer(), + verbose: verbose, + }, nil +} + +// Run runs the agent using a given yaml config. If an agent is already running, +// it will be killed. +func (s *agentRunner) Run(conf []byte) error { + cfgPath, err := s.createConfigFile(conf) + if err != nil { + return fmt.Errorf("agent: error creating config: %v", err) + } + timeout := time.After(5 * time.Second) + exit := s.runAgentConfig(cfgPath) + for { + select { + case err := <-exit: + return fmt.Errorf("agent: %v, log output:\n%s", err, s.Log()) + case <-timeout: + return fmt.Errorf("agent: timed out waiting for start, log:\n%s", s.Log()) + default: + if strings.Contains(s.log.String(), "listening for traces at") { + if s.verbose { + log.Print("agent: listening for traces") + } + return nil + } + time.Sleep(5 * time.Millisecond) + } + } +} + +// Log returns the tail of the agent log (up to 1M). +func (s *agentRunner) Log() string { return s.log.String() } + +// PID returns the process ID of the trace-agent. If the trace-agent is not running +// as a child process of this program, it will be 0. +func (s *agentRunner) PID() int { + s.mu.RLock() + defer s.mu.RUnlock() + return s.pid +} + +// Addr returns the address of the trace agent receiver. +func (s *agentRunner) Addr() string { return fmt.Sprintf("localhost:%d", s.port) } + +// Kill stops a running trace-agent, if it was started by this process. +func (s *agentRunner) Kill() { + pid := s.PID() + if pid == 0 { + return + } + proc, err := os.FindProcess(pid) + if err != nil { + return + } + if err := proc.Kill(); err != nil { + if s.verbose { + log.Print("couldn't kill running agent: ", err) + } + } + proc.Wait() +} + +func (s *agentRunner) runAgentConfig(path string) <-chan error { + s.Kill() + cmd := exec.Command("trace-agent", "-config", path) + s.log.Reset() + cmd.Stdout = s.log + cmd.Stderr = ioutil.Discard + cmd.Start() + + s.mu.Lock() + s.pid = cmd.Process.Pid + s.mu.Unlock() + + ch := make(chan error, 1) // don't block + go func() { + ch <- cmd.Wait() + os.Remove(path) + s.mu.Lock() + s.pid = 0 + s.mu.Unlock() + if s.verbose { + log.Print("agent: killed") + } + }() + return ch +} + +// createConfigFile creates a config file from the given config, altering the +// apm_config.apm_dd_url and log_level values and returns the full path. 
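For orientation, a hypothetical sketch of the internal runner lifecycle as seen from inside the test package; conf stands for YAML configuration bytes (see createConfigFile below) and is assumed to be defined elsewhere:

// inside package test
r, err := newAgentRunner("localhost:8888", true) // fake backend address, verbose output
if err != nil {
	log.Fatal(err) // e.g. ErrNotInstalled when trace-agent is missing and cannot be installed
}
if err := r.Run(conf); err != nil {
	log.Fatal(err) // Run fails if the agent does not report readiness within 5 seconds
}
defer r.Kill()
log.Printf("trace-agent pid=%d listening on %s", r.PID(), r.Addr())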
+func (s *agentRunner) createConfigFile(conf []byte) (string, error) { + v := viper.New() + v.SetConfigType("yaml") + if err := v.ReadConfig(bytes.NewReader(conf)); err != nil { + return "", err + } + s.port = 8126 + if v.IsSet("apm_config.receiver_port") { + s.port = v.GetInt("apm_config.receiver_port") + } + v.Set("apm_config.apm_dd_url", "http://"+s.ddAddr) + if !v.IsSet("api_key") { + v.Set("api_key", "testing123") + } + if !v.IsSet("apm_config.trace_writer.flush_period_seconds") { + v.Set("apm_config.trace_writer.flush_period_seconds", 0.1) + } + v.Set("log_level", "info") + out, err := yaml.Marshal(v.AllSettings()) + if err != nil { + return "", err + } + dir, err := ioutil.TempDir("", "agent-conf-") + if err != nil { + return "", err + } + f, err := os.Create(filepath.Join(dir, "datadog.yaml")) + if err != nil { + return "", err + } + if _, err := f.Write(out); err != nil { + return "", err + } + if err := f.Close(); err != nil { + return "", err + } + return f.Name(), nil +} diff --git a/pkg/trace/test/backend.go b/pkg/trace/test/backend.go new file mode 100644 index 0000000000000..0a9e175ce2ed1 --- /dev/null +++ b/pkg/trace/test/backend.go @@ -0,0 +1,152 @@ +package test + +import ( + "compress/gzip" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "log" + "net/http" + "sync/atomic" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/gogo/protobuf/proto" +) + +// defaultBackendAddress is the default listening address for the fake +// backend. +const defaultBackendAddress = "localhost:8888" + +// defaultChannelSize is the default size of the buffered channel +// receiving any payloads sent by the trace-agent to the backend. +const defaultChannelSize = 100 + +type fakeBackend struct { + srv http.Server + out chan interface{} // payload output + started uint64 // 0 if server is stopped +} + +func newFakeBackend(channelSize int) *fakeBackend { + size := defaultChannelSize + if channelSize != 0 { + size = channelSize + } + fb := fakeBackend{ + out: make(chan interface{}, size), + } + mux := http.NewServeMux() + mux.HandleFunc("/api/v0.2/traces", fb.handleTraces) + mux.HandleFunc("/api/v0.2/stats", fb.handleStats) + mux.HandleFunc("/_health", fb.handleHealth) + + fb.srv = http.Server{ + Addr: defaultBackendAddress, + Handler: mux, + } + return &fb +} + +func (s *fakeBackend) Start() error { + if atomic.LoadUint64(&s.started) > 0 { + // already running + return nil + } + go func() { + atomic.StoreUint64(&s.started, 1) + defer atomic.StoreUint64(&s.started, 0) + if err := s.srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("server: %v", err) + } + }() + + timeout := time.After(5 * time.Second) + for { + select { + case <-timeout: + return errors.New("server: timed out out waiting for start") + default: + resp, err := http.Get(fmt.Sprintf("http://%s/_health", s.srv.Addr)) + if err == nil && resp.StatusCode == http.StatusOK { + return nil + } + time.Sleep(5 * time.Millisecond) + } + } +} + +func (s *fakeBackend) Out() <-chan interface{} { return s.out } + +// Shutdown shuts down the backend and stops any running agent. 
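A hypothetical minimal configuration as it could be handed to the runner; anything not set here is filled in by createConfigFile above (apm_dd_url is pointed at the fake backend, api_key and log_level get defaults):

conf := []byte(`
hostname: my-test-host
apm_config:
  receiver_port: 8126
`)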
+func (s *fakeBackend) Shutdown(wait time.Duration) error { + defer close(s.out) + + ctx, _ := context.WithTimeout(context.Background(), wait) + return s.srv.Shutdown(ctx) +} + +func (s *fakeBackend) handleHealth(w http.ResponseWriter, req *http.Request) { + w.WriteHeader(http.StatusOK) +} + +func (s *fakeBackend) handleStats(w http.ResponseWriter, req *http.Request) { + var payload agent.StatsPayload + if err := readJSONRequest(req, &payload); err != nil { + log.Println("server: error reading stats: ", err) + } + s.out <- payload +} + +func (s *fakeBackend) handleTraces(w http.ResponseWriter, req *http.Request) { + var payload pb.TracePayload + if err := readProtoRequest(req, &payload); err != nil { + log.Println("server: error reading traces: ", err) + } + s.out <- payload +} + +func readJSONRequest(req *http.Request, v interface{}) error { + rc, err := readCloserFromRequest(req) + if err != nil { + return err + } + defer rc.Close() + return json.NewDecoder(rc).Decode(v) +} + +func readProtoRequest(req *http.Request, msg proto.Message) error { + rc, err := readCloserFromRequest(req) + if err != nil { + return err + } + slurp, err := ioutil.ReadAll(rc) + defer rc.Close() + if err != nil { + return err + } + return proto.Unmarshal(slurp, msg) +} + +func readCloserFromRequest(req *http.Request) (io.ReadCloser, error) { + rc := struct { + io.Reader + io.Closer + }{ + Reader: req.Body, + Closer: req.Body, + } + if req.Header.Get("Accept-Encoding") == "gzip" { + gz, err := gzip.NewReader(req.Body) + if err != nil { + return nil, err + } + defer gz.Close() + rc.Reader = gz + } + return rc, nil +} diff --git a/pkg/trace/test/buffer.go b/pkg/trace/test/buffer.go new file mode 100644 index 0000000000000..ede410c9e2ef2 --- /dev/null +++ b/pkg/trace/test/buffer.go @@ -0,0 +1,56 @@ +package test + +import ( + "sync" +) + +const defaultBufferSize = 1e9 // 1M + +// safeBuffer is a thread safe buffer implementation which acts like a rolling +// buffer based on the size of the internal slice. +type safeBuffer struct { + mu sync.RWMutex + b []byte + off int +} + +// newSafeBuffer returns a new safe buffer with a default rolling size. +func newSafeBuffer() *safeBuffer { + return newSafeBufferWithSize(defaultBufferSize) +} + +// newSafeBuffer returns a new safe buffer having the given size. 
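The rolling behavior keeps captured agent logs bounded: only the most recent `size` bytes of everything written survive. A short sketch inside the test package, reusing a case from the table test below:

buf := newSafeBufferWithSize(10)
buf.Write([]byte("abcdefghijklmnop")) // 16 bytes written into a 10-byte buffer
fmt.Println(buf.String())             // "ghijklmnop": only the last 10 bytes remain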
+func newSafeBufferWithSize(size int) *safeBuffer { + return &safeBuffer{b: make([]byte, size)} +} + +func (sb *safeBuffer) Reset() { + sb.mu.Lock() + sb.off = 0 + sb.mu.Unlock() +} + +func (sb *safeBuffer) String() string { + sb.mu.RLock() + defer sb.mu.RUnlock() + return string(sb.b[:sb.off]) +} + +func (sb *safeBuffer) Write(p []byte) (int, error) { + sb.mu.Lock() + defer sb.mu.Unlock() + n := len(p) + if n >= len(sb.b) { + // p is bigger than the whole buffer; we store only + // the last len(sb.b) bytes + sb.off = copy(sb.b, p[n-len(sb.b):]) + return n, nil + } + if n > len(sb.b)-sb.off { + // shift to make space in the buffer + copy(sb.b, sb.b[n-(len(sb.b)-sb.off):sb.off]) + sb.off = len(sb.b) - n + } + sb.off += copy(sb.b[sb.off:], p) + return n, nil +} diff --git a/pkg/trace/test/buffer_test.go b/pkg/trace/test/buffer_test.go new file mode 100644 index 0000000000000..fce6114672b41 --- /dev/null +++ b/pkg/trace/test/buffer_test.go @@ -0,0 +1,33 @@ +package test + +import ( + "testing" +) + +func TestSafeBuffer(t *testing.T) { + sb := newSafeBufferWithSize(10) + for i, tt := range []struct { + in string + out string + }{ + {"12345", "12345"}, + {"67", "1234567"}, + {"123456", "4567123456"}, + {"789", "7123456789"}, + {"abcdefg", "789abcdefg"}, + {"abcdefghij", "abcdefghij"}, + {"abcdefghijklmnop", "ghijklmnop"}, + } { + n, err := sb.Write([]byte(tt.in)) + if err != nil { + t.Fatal(err) + } + if n != len(tt.in) { + t.Fatalf("wrote %d instead of %d on step %d", n, len(tt.in), i) + } + if sb.String() != tt.out { + t.Fatalf("got %q, wanted %q", sb.String(), tt.out) + } + } + +} diff --git a/pkg/trace/test/doc.go b/pkg/trace/test/doc.go new file mode 100644 index 0000000000000..3294486074711 --- /dev/null +++ b/pkg/trace/test/doc.go @@ -0,0 +1,4 @@ +// Package test provides utilities for running integration tests on the trace agent. +// You may use the runner to start a fake backend, a trace-agent instance with a custom +// configuration, post payloads to the agent and assert the results. +package test diff --git a/pkg/trace/test/example_test.go b/pkg/trace/test/example_test.go new file mode 100644 index 0000000000000..e906fe45a6686 --- /dev/null +++ b/pkg/trace/test/example_test.go @@ -0,0 +1,48 @@ +package test + +import ( + "fmt" + "io/ioutil" + "log" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" +) + +// The below example shows a common use-case scenario for the runner. +func Example() { + var runner Runner + // Start the runner. + if err := runner.Start(); err != nil { + log.Fatal(err) + } + defer log.Fatal(runner.Shutdown(time.Second)) + + // Run an agent with a given config. + conf, err := ioutil.ReadFile("/opt/datadog-agent/etc/datadog.yaml") + if err != nil { + log.Fatal(err) + } + if err := runner.RunAgent(conf); err != nil { + log.Fatal(err) + } + + // Post a payload. + payload := pb.Traces{ + pb.Trace{testutil.RandomSpan()}, + pb.Trace{testutil.RandomSpan()}, + } + if err := runner.Post(payload); err != nil { + log.Fatal(err) + } + + // Assert the results. 
+ switch v := (<-runner.Out()).(type) { + case pb.TracePayload: + fmt.Println("OK traces: ", len(v.Traces)) + case agent.StatsPayload: + fmt.Println("OK stats: ", len(v.Stats)) + } +} diff --git a/pkg/trace/test/runner.go b/pkg/trace/test/runner.go new file mode 100644 index 0000000000000..b5376d80defdc --- /dev/null +++ b/pkg/trace/test/runner.go @@ -0,0 +1,122 @@ +package test + +import ( + "bytes" + "errors" + "fmt" + "net/http" + "strconv" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/tinylib/msgp/msgp" +) + +// ErrNotStarted is returned when attempting to operate an unstarted Runner. +var ErrNotStarted = errors.New("runner: not started") + +// Runner can start an agent instance using a custom configuration, send payloads +// to it and act as a fake backend. Call Start first to initiate the fake backend, +// then RunAgent to start agent instances. Post may be used to send payloads to the +// agent and Out to receive its output. +type Runner struct { + // Verbose will make the runner output more verbose, more specifically + // around operations regarding the trace-agent process. + Verbose bool + + // ChannelSize specifies the size of the payload buffer of the fake backend. + // If reached, HTTP handlers will block until payloads are received from + // the out channel. It defaults to 100. + ChannelSize int + + agent *agentRunner + backend *fakeBackend +} + +// Start initializes the runner and starts the fake backend. +func (s *Runner) Start() error { + s.backend = newFakeBackend(s.ChannelSize) + agent, err := newAgentRunner(s.backend.srv.Addr, s.Verbose) + if err != nil { + return err + } + s.agent = agent + return s.backend.Start() +} + +// Shutdown stops any running agent and shuts down the fake backend. +func (s *Runner) Shutdown(wait time.Duration) error { + if s.agent == nil || s.backend == nil { + return ErrNotStarted + } + s.agent.Kill() + if err := s.backend.Shutdown(wait); err != nil { + return err + } + s.agent = nil + s.backend = nil + return nil +} + +// RunAgent starts an agent instance using the given YAML configuration. +func (s *Runner) RunAgent(conf []byte) error { + if s.agent == nil { + return ErrNotStarted + } + return s.agent.Run(conf) +} + +// AgentLog returns up to 1MB of tail from the trace agent log. +func (s *Runner) AgentLog() string { + if s.agent == nil { + return "" + } + return s.agent.Log() +} + +// KillAgent kills any agent that was started by this runner. +func (s *Runner) KillAgent() { + if s.agent == nil { + return + } + s.agent.Kill() +} + +// Out returns a channel which will provide payloads received by the fake backend. +// They can be of type pb.TracePayload or agent.StatsPayload. +func (s *Runner) Out() <-chan interface{} { + if s.backend == nil { + closedCh := make(chan interface{}) + close(closedCh) + return closedCh + } + return s.backend.Out() +} + +// Post posts the given list of traces to the trace agent. Before posting, agent must +// be started. You can start an agent using RunAgent. 
+func (s *Runner) Post(traceList pb.Traces) error { + if s.agent == nil { + return ErrNotStarted + } + if s.agent.PID() == 0 { + return errors.New("post: trace-agent not running") + } + + var buf bytes.Buffer + if err := msgp.Encode(&buf, traceList); err != nil { + return err + } + addr := fmt.Sprintf("http://%s/v0.3/traces", s.agent.Addr()) + req, err := http.NewRequest("POST", addr, &buf) + if err != nil { + return err + } + req.Header.Set("X-Datadog-Trace-Count", strconv.Itoa(len(traceList))) + req.Header.Set("Content-Type", "application/msgpack") + req.Header.Set("Content-Length", strconv.Itoa(buf.Len())) + + _, err = http.DefaultClient.Do(req) + // TODO: check response + return err +} diff --git a/pkg/trace/test/testsuite/hostname_test.go b/pkg/trace/test/testsuite/hostname_test.go new file mode 100644 index 0000000000000..8bc7e1676bd48 --- /dev/null +++ b/pkg/trace/test/testsuite/hostname_test.go @@ -0,0 +1,100 @@ +package testsuite + +import ( + "os" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" + "github.com/DataDog/datadog-agent/pkg/trace/test" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" +) + +func TestHostname(t *testing.T) { + r := test.Runner{} + if err := r.Start(); err != nil { + t.Fatal(err) + } + defer func() { + if err := r.Shutdown(time.Second); err != nil { + t.Log("shutdown: ", err) + } + }() + + // testHostname returns a test which asserts that for the given agent conf, the + // expectedHostname is sent to the backend. + testHostname := func(conf []byte, expectedHostname string) func(*testing.T) { + return func(t *testing.T) { + if err := r.RunAgent(conf); err != nil { + t.Fatal(err) + } + defer r.KillAgent() + + payload := pb.Traces{pb.Trace{testutil.RandomSpan()}} + payload[0][0].Metrics[sampler.KeySamplingPriority] = 2 + if err := r.Post(payload); err != nil { + t.Fatal(err) + } + waitForTrace(t, r.Out(), func(v pb.TracePayload) { + if n := len(v.Traces); n != 1 { + t.Fatalf("expected %d traces, got %d", len(payload), n) + } + if v.HostName != expectedHostname { + t.Fatalf("expected %q, got %q", expectedHostname, v.HostName) + } + }) + } + } + + t.Run("from-config", testHostname([]byte(`hostname: asdq`), "asdq")) + + t.Run("env", func(t *testing.T) { + os.Setenv("DD_HOSTNAME", "my-env-host") + defer os.Unsetenv("DD_HOSTNAME") + testHostname([]byte(`hostname: my-host`), "my-env-host")(t) + }) + + t.Run("auto", func(t *testing.T) { + if err := r.RunAgent(nil); err != nil { + t.Fatal(err) + } + defer r.KillAgent() + + payload := pb.Traces{pb.Trace{testutil.RandomSpan()}} + payload[0][0].Metrics[sampler.KeySamplingPriority] = 2 + if err := r.Post(payload); err != nil { + t.Fatal(err) + } + waitForTrace(t, r.Out(), func(v pb.TracePayload) { + if n := len(v.Traces); n != 1 { + t.Fatalf("expected %d traces, got %d", len(payload), n) + } + if v.HostName == "" { + t.Fatal("hostname detection failed") + } + }) + }) +} + +// waitForTrace waits on the out channel until it times out or receives an pb.TracePayload. +// If the latter happens it will call fn. +func waitForTrace(t *testing.T, out <-chan interface{}, fn func(pb.TracePayload)) { + waitForTraceTimeout(t, out, 3*time.Second, fn) +} + +// waitForTraceTimeout behaves like waitForTrace but allows a customizable wait time. 
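The literal 2 stored in the root span's metrics above is PriorityUserKeep; the typed helper from the sampler package expresses the same intent and keeps the trace from being dropped on its way to the fake backend:

// equivalent to payload[0][0].Metrics[sampler.KeySamplingPriority] = 2
sampler.SetSamplingPriority(payload[0][0], sampler.PriorityUserKeep)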
+func waitForTraceTimeout(t *testing.T, out <-chan interface{}, wait time.Duration, fn func(pb.TracePayload)) { + timeout := time.After(wait) + for { + select { + case p := <-out: + if v, ok := p.(pb.TracePayload); ok { + fn(v) + return + } + case <-timeout: + t.Fatal("timed out") + } + } +} diff --git a/pkg/trace/test/testutil/backoff.go b/pkg/trace/test/testutil/backoff.go new file mode 100644 index 0000000000000..933f866278ae1 --- /dev/null +++ b/pkg/trace/test/testutil/backoff.go @@ -0,0 +1,59 @@ +package testutil + +import "time" + +// TestBackoffTimer is a backoff timer that ticks on-demand. +type TestBackoffTimer struct { + tickChannel chan time.Time +} + +// NewTestBackoffTimer creates a new instance of a TestBackoffTimer. +func NewTestBackoffTimer() *TestBackoffTimer { + return &TestBackoffTimer{ + // tick channel without buffer allows us to sync with the sender during the tests by sending ticks + tickChannel: make(chan time.Time), + } +} + +// ScheduleRetry on a TestBackoffTimer is a no-op. +func (t *TestBackoffTimer) ScheduleRetry(err error) (int, time.Duration) { + // Do nothing, we'll trigger whenever we want + return 0, 0 +} + +// CurrentDelay in a TestBackoffTimer always returns 0. +func (t *TestBackoffTimer) CurrentDelay() time.Duration { + // This timer doesn't have delays, it's triggered on-demand + return 0 +} + +// NumRetries in a TestBackoffTimer always returns 0. +func (t *TestBackoffTimer) NumRetries() int { + // This timer doesn't keep track of num retries + return 0 +} + +// ReceiveTick returns the channel where ticks are sent. +func (t *TestBackoffTimer) ReceiveTick() <-chan time.Time { + return t.tickChannel +} + +// TriggerTick immediately sends a tick with the current timestamp through the ticking channel. +func (t *TestBackoffTimer) TriggerTick() { + t.tickChannel <- time.Now() +} + +// Reset in a TestBackoffTimer is a no-op. +func (t *TestBackoffTimer) Reset() { + // Nothing to reset +} + +// Stop in a TestBackoffTimer is a no-op. +func (t *TestBackoffTimer) Stop() { + // Nothing to stop +} + +// Close closes the ticking channel of this backoff timer. +func (t *TestBackoffTimer) Close() { + close(t.tickChannel) +} diff --git a/pkg/trace/test/testutil/random.go b/pkg/trace/test/testutil/random.go new file mode 100644 index 0000000000000..f878917c7551a --- /dev/null +++ b/pkg/trace/test/testutil/random.go @@ -0,0 +1,31 @@ +package testutil + +import ( + "bytes" + "math/rand" + "strconv" +) + +// RandomSizedBytes creates a random byte slice with the specified size. +func RandomSizedBytes(size int) []byte { + buffer := bytes.Buffer{} + + for i := 0; i < size; i++ { + buffer.WriteByte(byte(rand.Int())) + } + + return buffer.Bytes() +} + +// RandomStringMap creates a random map with string keys and values. 
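TestBackoffTimer lets tests drive retry loops deterministically: ReceiveTick never fires on its own, only when TriggerTick is called. A minimal hypothetical sketch:

package main

import (
	"fmt"

	"github.com/DataDog/datadog-agent/pkg/trace/test/testutil"
)

func main() {
	timer := testutil.NewTestBackoffTimer()
	done := make(chan struct{})
	go func() {
		<-timer.ReceiveTick() // a component under test would block here
		fmt.Println("retry triggered")
		close(done)
	}()
	timer.TriggerTick() // fire a retry immediately, no real time involved
	<-done
	timer.Close()
}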
+func RandomStringMap() map[string]string { + length := rand.Intn(32) + + m := map[string]string{} + + for i := 0; i < length; i++ { + m[strconv.Itoa(rand.Int())] = strconv.Itoa(rand.Int()) + } + + return m +} diff --git a/pkg/trace/test/testutil/sampler.go b/pkg/trace/test/testutil/sampler.go new file mode 100644 index 0000000000000..18c1725a82f99 --- /dev/null +++ b/pkg/trace/test/testutil/sampler.go @@ -0,0 +1,42 @@ +package testutil + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/sampler" +) + +// MockEngine mocks a sampler engine +type MockEngine struct { + wantSampled bool + wantRate float64 +} + +// NewMockEngine returns a MockEngine for tests +func NewMockEngine(wantSampled bool, wantRate float64) *MockEngine { + return &MockEngine{wantSampled: wantSampled, wantRate: wantRate} +} + +// Sample returns a constant rate +func (e *MockEngine) Sample(_ pb.Trace, _ *pb.Span, _ string) (bool, float64) { + return e.wantSampled, e.wantRate +} + +// Run mocks Engine.Run() +func (e *MockEngine) Run() { + return +} + +// Stop mocks Engine.Stop() +func (e *MockEngine) Stop() { + return +} + +// GetState mocks Engine.GetState() +func (e *MockEngine) GetState() interface{} { + return nil +} + +// GetType mocks Engine.GetType() +func (e *MockEngine) GetType() sampler.EngineType { + return sampler.NormalScoreEngineType +} diff --git a/pkg/trace/test/testutil/services.go b/pkg/trace/test/testutil/services.go new file mode 100644 index 0000000000000..3c20801448047 --- /dev/null +++ b/pkg/trace/test/testutil/services.go @@ -0,0 +1,29 @@ +package testutil + +import ( + "fmt" + "math/rand" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// RandomServices generates random services metadata +func RandomServices(maxServices, maxTags int) pb.ServicesMetadata { + services := make(map[string]map[string]string) + + k := 0 + nbServices := 1 + rand.Intn(maxServices-1) + for i := 0; i < nbServices; i++ { + service := fmt.Sprintf("service%03d", i) + services[service] = make(map[string]string) + nbTags := 1 + rand.Intn(maxTags-1) + for j := 0; j < nbTags; j++ { + key := fmt.Sprintf("key%05d", k) + value := fmt.Sprintf("value%04d", k) + services[service][key] = value + k++ + } + } + + return services +} diff --git a/pkg/trace/test/testutil/span.go b/pkg/trace/test/testutil/span.go new file mode 100644 index 0000000000000..b4b41b11a27b4 --- /dev/null +++ b/pkg/trace/test/testutil/span.go @@ -0,0 +1,329 @@ +// In this file we define methods and global variables to: +// allow generation of arbitrary/random VALID spans +// pick random attributes for a span + +package testutil + +import ( + "math/rand" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" +) + +// YearNS is the number of nanoseconds in a year +var YearNS = time.Duration(time.Hour * 24 * 365).Nanoseconds() + +var durations = []int64{ + 1 * 1e3, // 1us + 10 * 1e3, // 10us + 100 * 1e3, // 100us + 1 * 1e6, // 1ms + 50 * 1e6, // 50ms + 100 * 1e6, // 100ms + 500 * 1e6, // 500ms + 1 * 1e9, // 1s + 2 * 1e9, // 2s + 10 * 1e9, // 10s +} + +var errors = []int32{ + 0, + 1, + 2, + 400, + 403, + 502, +} + +var resources = []string{ + "GET cache|xxx", + "events.buckets", + "SELECT user.handle AS user_handle, user.id AS user_id, user.org_id AS user_org_id, user.password AS user_password, user.email AS user_email, user.name AS user_name, user.role AS user_role, user.team AS user_team, 
user.support AS user_support, user.is_admin AS user_is_admin, user.github_username AS user_github_username, user.github_token AS user_github_token, user.disabled AS user_disabled, user.verified AS user_verified, user.bot AS user_bot, user.created AS user_created, user.modified AS user_modified, user.time_zone AS user_time_zone, user.password_modified AS user_password_modified FROM user WHERE user.id = ? AND user.org_id = ? LIMIT ?", + "データの犬", + "GET /url/test/fixture/resource/42", +} + +var services = []string{ + "rails", + "django", + "web-billing", + "pg-master", + "pylons", +} + +var names = []string{ + "web.query", + "sqlalchemy", + "web.template", + "pylons.controller", + "postgres.query", +} + +var metas = map[string][]string{ + "query": []string{ + "GET beaker:c76db4c3af90410197cf88b0afba4942:session", + "SELECT id\n FROM ddsuperuser\n WHERE id = %(id)s", + "\n -- get_contexts_sub_query[[org:9543 query_id:a135e15e7d batch:1]]\n WITH sub_contexts as (\n \n -- \n --\n SELECT key,\n host_name,\n device_name,\n tags,\n org_id\n FROM vs9543.dim_context c\n WHERE key = ANY(%(key)s)\n \n \n \n \n \n )\n \n -- \n --\n SELECT key,\n host_name,\n device_name,\n tags\n FROM sub_contexts c\n WHERE (c.org_id = %(org_id)s AND c.tags @> %(yes_tags0)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags1)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags2)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags3)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags4)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags5)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags6)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags7)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags8)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags9)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags10)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags11)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags12)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags13)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags14)s)\n OR (c.org_id = %(org_id)s AND c.tags @> %(yes_tags15)s)\n \n \n \n \n \n ", + }, + "in.host": []string{ + "8.8.8.8", + "172.0.0.42", + "2a01:e35:2ee1:7160:f66d:4ff:fe71:b690", + "postgres.service.consul", + "", + }, + "out.host": []string{ + "/dev/null", + "138.195.130.42", + "raclette.service", + "datadoghq.com", + }, + "in.section": []string{ + "4242", + "22", + "dogdataprod", + "replica", + }, + "out.section": []string{ + "-", + "8080", + "standby", + "proxy-XXX", + }, + "user": []string{ + "mattp", + "bartek", + "benjamin", + "leo", + }, +} + +var metrics = []string{ + "rowcount", + "size", + "payloads", + "loops", + "heap_allocated", + "results", +} + +var types = []string{ + "http", + "sql", + "redis", + "lamar", +} + +type sliceRandomizer interface { + Len() int + Get(int) interface{} +} + +type int64Slice []int64 + +func (s int64Slice) Len() int { return len(s) } +func (s int64Slice) Get(i int) interface{} { return s[i] } + +type int32Slice []int32 + +func (s int32Slice) Len() int { return len(s) } +func (s int32Slice) Get(i int) interface{} { return s[i] } + +type stringSlice []string + +func (s stringSlice) Len() int { return len(s) } +func (s stringSlice) Get(i int) interface{} { return s[i] } + +func randomChoice(s sliceRandomizer) interface{} { + if s.Len() == 0 { + return nil + } + return s.Get(rand.Intn(s.Len())) +} + +func int64RandomChoice(s []int64) int64 { + return randomChoice(int64Slice(s)).(int64) +} + +func int32RandomChoice(s []int32) 
int32 { + return randomChoice(int32Slice(s)).(int32) +} + +func stringRandomChoice(s []string) string { + return randomChoice(stringSlice(s)).(string) +} + +func randomTime() time.Time { + // we don't do rand.Int63() nanosecs because the given epoch + // (after 2300) can overflow. + // any time between now and the next year is good enough + return time.Now().Add(time.Duration(rand.Int63n(YearNS))) +} + +// RandomSpanDuration generates a random span duration +func RandomSpanDuration() int64 { + return int64RandomChoice(durations) +} + +// RandomSpanError generates a random span error code +func RandomSpanError() int32 { + return int32RandomChoice(errors) +} + +// RandomSpanResource generates a random span resource string +func RandomSpanResource() string { + return stringRandomChoice(resources) +} + +// RandomSpanService generates a random span service string +func RandomSpanService() string { + return stringRandomChoice(services) +} + +// RandomSpanName generates a random span name string +func RandomSpanName() string { + return stringRandomChoice(names) +} + +// RandomSpanID generates a random span ID +func RandomSpanID() uint64 { + return uint64(rand.Int63()) +} + +// RandomSpanStart generates a span start timestamp +func RandomSpanStart() int64 { + // Make sure spans end in the past + maxDuration := time.Duration(durations[len(durations)-1]) + offset := time.Duration(rand.Intn(10)) * time.Second + return time.Now().Add(-1 * maxDuration).Add(-1 * offset).UnixNano() +} + +// RandomSpanTraceID generates a random trace ID +func RandomSpanTraceID() uint64 { + return RandomSpanID() +} + +// RandomSpanMeta generates some random span metadata +func RandomSpanMeta() map[string]string { + res := make(map[string]string) + + // choose some of the keys + n := rand.Intn(len(metas)) + i := 0 + for k, s := range metas { + if i > n { + break + } + res[k] = stringRandomChoice(s) + i++ + } + + return res +} + +// RandomSpanMetrics generates some random span metrics +func RandomSpanMetrics() map[string]float64 { + res := make(map[string]float64) + + // choose some keys + n := rand.Intn(len(metrics)) + for _, i := range rand.Perm(n) { + res[metrics[i]] = rand.Float64() + } + + return res +} + +// RandomSpanParentID generates a random span parent ID +func RandomSpanParentID() uint64 { + return RandomSpanID() +} + +// RandomSpanType generates a random span type +func RandomSpanType() string { + return stringRandomChoice(types) +} + +// RandomSpan generates a wide-variety of spans, useful to test robustness & performance +func RandomSpan() *pb.Span { + return &pb.Span{ + Duration: RandomSpanDuration(), + Error: RandomSpanError(), + Resource: RandomSpanResource(), + Service: RandomSpanService(), + Name: RandomSpanName(), + SpanID: RandomSpanID(), + Start: RandomSpanStart(), + TraceID: RandomSpanTraceID(), + Meta: RandomSpanMeta(), + Metrics: RandomSpanMetrics(), + ParentID: RandomSpanParentID(), + Type: RandomSpanType(), + } +} + +// RandomWeightedSpan generates a random weighted span, useful for stats tests +func RandomWeightedSpan() *agent.WeightedSpan { + s := RandomSpan() + return &agent.WeightedSpan{ + Span: s, + Weight: 1, + TopLevel: true, + } +} + +// GetTestSpan returns a Span with different fields set +func GetTestSpan() *pb.Span { + span := &pb.Span{ + TraceID: 42, + SpanID: 52, + ParentID: 42, + Type: "web", + Service: "fennel_IS amazing!", + Name: "something &&<@# that should be a metric!", + Resource: "NOT touched because it is going to be hashed", + Start: 9223372036854775807, + Duration: 
9223372036854775807, + Meta: map[string]string{"http.host": "192.168.0.1"}, + Metrics: map[string]float64{"http.monitor": 41.99}, + } + trace := pb.Trace{span} + traceutil.ComputeTopLevel(trace) + return trace[0] +} + +// TestSpan returns a fix span with hardcoded info, useful for reproducible tests +func TestSpan() *pb.Span { + return &pb.Span{ + Duration: 10000000, + Error: 0, + Resource: "GET /some/raclette", + Service: "django", + Name: "django.controller", + SpanID: 42, + Start: 1472732573337575936, + TraceID: 424242, + Meta: map[string]string{ + "user": "leo", + "pool": "fondue", + }, + Metrics: map[string]float64{ + "cheese_weight": 100000.0, + }, + ParentID: 1111, + Type: "http", + } +} + +// TestWeightedSpan returns a static test weighted span for reproductive stats tests +func TestWeightedSpan() *agent.WeightedSpan { + s := TestSpan() + return &agent.WeightedSpan{ + Span: s, + Weight: 1, + TopLevel: true, + } +} diff --git a/pkg/trace/test/testutil/span_test.go b/pkg/trace/test/testutil/span_test.go new file mode 100644 index 0000000000000..6e0c6104f07e1 --- /dev/null +++ b/pkg/trace/test/testutil/span_test.go @@ -0,0 +1,25 @@ +package testutil + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/stretchr/testify/assert" +) + +func TestRandomSpan(t *testing.T) { + assert := assert.New(t) + + for i := 0; i < 1000; i++ { + s := RandomSpan() + err := agent.Normalize(s) + assert.Nil(err) + } +} + +func TestTestSpan(t *testing.T) { + assert := assert.New(t) + ts := TestSpan() + err := agent.Normalize(ts) + assert.Nil(err) +} diff --git a/pkg/trace/test/testutil/stats.go b/pkg/trace/test/testutil/stats.go new file mode 100644 index 0000000000000..1815f8d4728b7 --- /dev/null +++ b/pkg/trace/test/testutil/stats.go @@ -0,0 +1,168 @@ +package testutil + +import ( + "encoding/json" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" +) + +var defaultAggregators = []string{"service", "resource"} + +const defaultEnv = "none" + +// TestStatsBucket returns a fixed stats bucket to be used in unit tests +func TestStatsBucket() agent.StatsBucket { + srb := agent.NewStatsRawBucket(0, 1e9) + srb.HandleSpan(TestWeightedSpan(), defaultEnv, defaultAggregators, nil) + sb := srb.Export() + + // marshalling then unmarshalling data to: + // 1) make a deep copy which prevents unexpected side effects with + // Counts and Distributions sharing the same TagSets + // 2) do thing closer to what they are, for real, in production + // code as indeed, stats buckets are (un)marshalled + js, err := json.Marshal(sb) + if err != nil { + return agent.NewStatsBucket(0, 1e9) + } + var sb2 agent.StatsBucket + err = json.Unmarshal(js, &sb2) + if err != nil { + return agent.NewStatsBucket(0, 1e9) + } + return sb2 +} + +// StatsBucketWithSpans returns a stats bucket populated with spans stats +func StatsBucketWithSpans(spans []*agent.WeightedSpan) agent.StatsBucket { + srb := agent.NewStatsRawBucket(0, 1e9) + for _, s := range spans { + srb.HandleSpan(s, defaultEnv, defaultAggregators, nil) + } + return srb.Export() +} + +// RandomStatsBucket returns a bucket made from n random spans, useful to run benchmarks and tests +func RandomStatsBucket(n int) agent.StatsBucket { + spans := make([]*agent.WeightedSpan, 0, n) + for i := 0; i < n; i++ { + spans = append(spans, RandomWeightedSpan()) + } + + return StatsBucketWithSpans(spans) +} + +// TestDistroValues is a pre-defined list of values +var TestDistroValues = []int64{ + 49873, 81744, 46545, 43680, 7535, 33334, 93009, 23777, 33471, 
68629, + 94601, 83827, 3556, 15913, 84957, 368, 71879, 73687, 55039, 89704, + 98733, 40820, 62839, 26673, 55731, 45477, 15893, 45488, 72297, 29134, + 57683, 6782, 10496, 16713, 62976, 7545, 87884, 7963, 16105, 28633, + 19613, 33881, 53049, 39639, 68647, 99105, 95954, 79172, 65798, 32334, + 66448, 13783, 56688, 17350, 42414, 18336, 63655, 59545, 42014, 74478, + 70263, 6860, 19339, 36375, 72034, 51899, 98473, 22231, 57126, 4482, + 31985, 35335, 89732, 58843, 28695, 50653, 23740, 29245, 72152, 16566, + 19598, 94928, 88210, 9813, 61112, 14225, 282, 40069, 80421, 71429, + 30896, 38353, 34031, 65116, 20348, 57019, 91726, 3143, 48396, 25658, + 465, 48299, 4127, 73883, 99755, 95259, 79187, 59794, 25740, 62633, + 61585, 26320, 96966, 57059, 2201, 20065, 58359, 75706, 67622, 90459, + 19300, 40384, 98456, 65224, 15020, 35819, 48079, 8554, 41658, 22967, + 28764, 78538, 78314, 73160, 8707, 83916, 7982, 38096, 45418, 78655, + 27987, 41748, 84730, 91216, 83098, 49090, 426, 48221, 26862, 70959, + 32132, 19862, 95997, 3027, 19438, 38393, 33338, 25567, 14618, 31610, + 88956, 4252, 81845, 77757, 58023, 64701, 24762, 11909, 79436, 67507, + 63004, 62749, 55296, 88204, 43255, 15385, 4404, 75079, 32425, 32088, + 35378, 83907, 15201, 37043, 49320, 3941, 10696, 77039, 45697, 33241, + 6414, 91211, 11473, 39560, 1833, 59542, 30878, 40429, 18136, 45348, + 20395, 18976, 22945, 72978, 11297, 49834, 74443, 32954, 87079, 43619, + 51680, 44241, 24348, 30395, 8241, 6038, 34042, 10788, 43017, 1706, + 41296, 87732, 17445, 90738, 12690, 7810, 14243, 10162, 26128, 36418, + 90821, 63677, 87168, 35589, 89271, 2882, 19680, 46951, 67143, 99086, + 40945, 88011, 88062, 8742, 24121, 41593, 52634, 94285, 84646, 46255, + 66570, 11781, 4395, 82956, 98527, 6198, 10414, 71817, 52338, 8849, + 70229, 54649, 98215, 81781, 28883, 50424, 65524, 89666, 18922, 25075, + 26313, 91007, 45330, 52683, 19222, 58549, 15102, 66637, 11874, 96489, + 20224, 96151, 38772, 77736, 26639, 63909, 5960, 81147, 68183, 15503, + 98095, 45086, 79831, 95974, 69140, 38202, 40126, 96299, 48670, 29259, + 21494, 60618, 45045, 63612, 56271, 57411, 90412, 43692, 4981, 79404, + 11842, 39727, 12257, 90435, 6909, 61222, 34525, 45393, 39051, 45634, + 11202, 86878, 89570, 84142, 8400, 30596, 55909, 22552, 45053, 34014, + 3546, 41567, 54300, 233, 53248, 78597, 39224, 87627, 82985, 43282, + 37318, 11994, 47289, 6375, 14274, 74678, 7444, 64063, 95054, 94864, + 56093, 11942, 66802, 71928, 816, 13229, 62403, 78549, 41223, 55717, + 19609, 56257, 28648, 6162, 3943, 9800, 97273, 30486, 50528, 66419, + 56069, 77098, 99676, 50095, 25915, 5126, 88303, 91216, 39747, 35313, + 67128, 33430, 80861, 4598, 98636, 58579, 464, 58865, 54999, 2770, + 50827, 31275, 28270, 81736, 50019, 30829, 7715, 28098, 59506, 93275, + 59696, 3620, 78626, 94467, 99199, 56480, 81559, 66099, 14158, 14121, + 58014, 77264, 36713, 23639, 99892, 28986, 15902, 71818, 40326, 6597, + 66142, 63904, 12735, 23989, 43671, 45438, 76740, 41381, 61377, 240, + 15913, 96435, 68748, 14924, 73254, 86370, 37633, 61430, 99398, 45688, + 8955, 8474, 97979, 39943, 93195, 65534, 22004, 19573, 53598, 14585, + 36601, 99530, 91841, 44689, 63644, 84307, 72608, 78387, 8859, 78854, + 50002, 22510, 85289, 95122, 5656, 25727, 79150, 55133, 4004, 96902, + 29830, 77912, 85867, 90171, 82337, 44654, 96195, 59459, 5902, 91724, + 67780, 7250, 85047, 34558, 38288, 78736, 19084, 714, 67720, 72898, + 48739, 61426, 557, 25487, 59289, 47253, 63439, 37965, 81039, 5683, + 7797, 32679, 23594, 65206, 19993, 25043, 25180, 20326, 23150, 36051, + 64304, 40757, 
57203, 26517, 68184, 67824, 95437, 75023, 88923, 70288, + 24445, 3, 95502, 77711, 56441, 7932, 67526, 68888, 99420, 55438, + 46474, 56435, 72679, 99497, 49292, 97114, 74148, 11560, 90975, 11458, + 41169, 3235, 69486, 98718, 79108, 91634, 55222, 55298, 8990, 86267, + 64122, 69275, 50964, 52229, 95153, 90588, 80232, 32330, 76329, 21423, + 67743, 58663, 78473, 63279, 990, 37566, 14986, 86231, 85598, 48049, + 10363, 57368, 31711, 8906, 21830, 80262, 95792, 17164, 60127, 57617, + 20080, 21982, 64448, 20778, 72023, 86362, 36221, 55531, 23085, 99240, + 67901, 90321, 20114, 62605, 96437, 24478, 53523, 28354, 80996, 80790, + 88883, 15785, 91293, 77907, 90565, 68434, 38138, 38726, 89991, 71803, + 63103, 77849, 170, 30055, 2028, 16229, 41089, 11047, 43713, 45225, + 13700, 47201, 6036, 12316, 99542, 53145, 79478, 36265, 3113, 10984, + 49406, 60035, 62615, 80977, 71344, 14200, 95778, 22538, 60343, 67009, + 63429, 32294, 27237, 68984, 12944, 32231, 55999, 37897, 90091, 80466, + 95801, 65865, 96564, 66561, 31327, 52672, 71584, 2776, 579, 91374, + 55089, 78267, 77595, 83646, 243, 58118, 79231, 99188, 62236, 44332, + 81093, 38651, 73028, 99672, 68818, 9953, 93758, 93236, 97302, 92746, + 33019, 14922, 29229, 54180, 52829, 90520, 38644, 51461, 29513, 66800, + 22806, 54867, 48009, 46546, 25875, 30956, 31243, 68299, 16312, 85165, + 64305, 77372, 10692, 6157, 59324, 29112, 63886, 72133, 85611, 38971, + 38992, 44689, 24522, 24774, 73909, 24398, 6723, 43141, 30123, 70649, + 56382, 67159, 26385, 65003, 98672, 69931, 66304, 1286, 23984, 7956, + 37911, 53510, 43011, 75474, 73917, 1584, 66755, 64636, 14254, 74482, + 21556, 59100, 17851, 55708, 22718, 24043, 74123, 40832, 2753, 86226, + 24531, 75018, 42006, 96396, 32645, 66235, 68342, 21044, 33145, 56726, + 96180, 31556, 54782, 93490, 74270, 71721, 33153, 73493, 71298, 35333, + 22479, 57204, 38705, 94009, 44158, 15787, 26768, 64158, 22096, 88571, + 45340, 91679, 24695, 64220, 25843, 70219, 66173, 81020, 89491, 74995, + 41378, 69249, 58966, 57816, 99462, 13518, 79224, 43384, 48332, 54966, + 86665, 31802, 46214, 42160, 11404, 63242, 36740, 20740, 56772, 34968, + 2681, 93064, 73128, 93007, 29572, 29621, 55075, 8266, 91077, 94482, + 95789, 60063, 85533, 50362, 41231, 49712, 35175, 75798, 9857, 66660, + 9971, 63999, 8772, 30798, 76458, 82119, 11532, 67945, 18010, 18487, + 58024, 27592, 45792, 25909, 37864, 35835, 38423, 77954, 78700, 89430, + 50421, 59400, 76022, 95436, 65119, 27730, 66893, 1156, 85816, 88373, + 50148, 25239, 59895, 60979, 87409, 99146, 3145, 44179, 51101, 26657, + 62651, 32522, 16767, 50925, 70089, 57291, 1766, 20794, 20314, 13281, + 72751, 37028, 28871, 92784, 42352, 58356, 41326, 85658, 8758, 44966, + 11090, 2746, 71165, 58595, 52442, 38558, 74826, 6675, 32401, 83481, + 93010, 19397, 86634, 3022, 94732, 11983, 84975, 44749, 80986, 5166, + 78078, 52211, 82787, 56617, 61960, 23051, 64815, 49412, 16945, 10430, + 61046, 38824, 85281, 59365, 87997, 96782, 4978, 29164, 68042, 66505, + 68244, 92104, 2331, 10527, 15497, 57600, 91716, 12689, 51048, 95748, + 79084, 26550, 93068, 89239, 32297, 30275, 3483, 19284, 83045, 45812, + 47572, 89241, 23722, 22646, 54408, 35989, 39531, 25405, 81469, 69026, + 59956, 88882, 47029, 32217, 9265, 9337, 15567, 71576, 82557, 83448, + 76538, 95379, 97595, 30781, 54709, 40266, 97288, 89581, 97335, 54606, + 64572, 99834, 97581, 10704, 51460, 54803, 41618, 41760, 31663, 42939, + 10327, 63265, 48904, 79260, 26562, 11528, 97745, 78918, 94479, 19453, +} + +// TestDistribution returns a distribution with pre-defined values +func 
TestDistribution() agent.Distribution { + tgs := agent.NewTagSetFromString("service:X,host:Z") + d := agent.NewDistribution("duration", "Y|duration|service:X,host:Z", "Y", tgs) + for i, v := range TestDistroValues { + d.Add(float64(v), uint64(i)) + } + + return d +} diff --git a/pkg/trace/test/testutil/stats_test.go b/pkg/trace/test/testutil/stats_test.go new file mode 100644 index 0000000000000..eea717e5276c9 --- /dev/null +++ b/pkg/trace/test/testutil/stats_test.go @@ -0,0 +1,19 @@ +package testutil + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestRandomStatsBucket(t *testing.T) { + for i := 10; i < 100; i += 10 { + b := RandomStatsBucket(i) + assert.False(t, b.IsEmpty()) + } +} + +func TestTestStatsBucket(t *testing.T) { + b := TestStatsBucket() + assert.False(t, b.IsEmpty()) +} diff --git a/pkg/trace/test/testutil/statsd.go b/pkg/trace/test/testutil/statsd.go new file mode 100644 index 0000000000000..8c7a988cc51ec --- /dev/null +++ b/pkg/trace/test/testutil/statsd.go @@ -0,0 +1,140 @@ +package testutil + +import ( + "math" + "sync" +) + +// StatsClientGaugeArgs represents arguments to a StatsClient Gauge method call. +type StatsClientGaugeArgs struct { + Name string + Value float64 + Tags []string + Rate float64 +} + +// StatsClientCountArgs represents arguments to a StatsClient Count method call. +type StatsClientCountArgs struct { + Name string + Value int64 + Tags []string + Rate float64 +} + +// StatsClientHistogramArgs represents arguments to a StatsClient Histogram method call. +type StatsClientHistogramArgs struct { + Name string + Value float64 + Tags []string + Rate float64 +} + +// CountSummary contains a summary of all Count method calls to a particular StatsClient for a particular key. +type CountSummary struct { + Calls []StatsClientCountArgs + Sum int64 +} + +// GaugeSummary contains a summary of all Gauge method calls to a particular StatsClient for a particular key. +type GaugeSummary struct { + Calls []StatsClientGaugeArgs + Last float64 + Max float64 +} + +// TestStatsClient is a mocked StatsClient that records all calls and replies with configurable error return values. +type TestStatsClient struct { + mu sync.RWMutex + + GaugeErr error + GaugeCalls []StatsClientGaugeArgs + CountErr error + CountCalls []StatsClientCountArgs + HistogramErr error + HistogramCalls []StatsClientHistogramArgs +} + +// Reset resets client's internal records. 
+func (c *TestStatsClient) Reset() { + c.mu.Lock() + defer c.mu.Unlock() + c.GaugeErr = nil + c.GaugeCalls = c.GaugeCalls[:0] + c.CountErr = nil + c.CountCalls = c.CountCalls[:0] + c.HistogramErr = nil + c.HistogramCalls = c.HistogramCalls[:0] +} + +// Gauge records a call to a Gauge operation and replies with GaugeErr +func (c *TestStatsClient) Gauge(name string, value float64, tags []string, rate float64) error { + c.mu.Lock() + defer c.mu.Unlock() + c.GaugeCalls = append(c.GaugeCalls, StatsClientGaugeArgs{Name: name, Value: value, Tags: tags, Rate: rate}) + return c.GaugeErr +} + +// Count records a call to a Count operation and replies with CountErr +func (c *TestStatsClient) Count(name string, value int64, tags []string, rate float64) error { + c.mu.Lock() + defer c.mu.Unlock() + c.CountCalls = append(c.CountCalls, StatsClientCountArgs{Name: name, Value: value, Tags: tags, Rate: rate}) + return c.CountErr +} + +// Histogram records a call to a Histogram operation and replies with HistogramErr +func (c *TestStatsClient) Histogram(name string, value float64, tags []string, rate float64) error { + c.mu.Lock() + defer c.mu.Unlock() + c.HistogramCalls = append(c.HistogramCalls, StatsClientHistogramArgs{Name: name, Value: value, Tags: tags, Rate: rate}) + return c.HistogramErr +} + +// GetCountSummaries computes summaries for all names supplied as parameters to Count calls. +func (c *TestStatsClient) GetCountSummaries() map[string]*CountSummary { + result := map[string]*CountSummary{} + + c.mu.RLock() + defer c.mu.RUnlock() + for _, countCall := range c.CountCalls { + name := countCall.Name + summary, ok := result[name] + + if !ok { + summary = &CountSummary{} + result[name] = summary + } + + summary.Calls = append(summary.Calls, countCall) + summary.Sum += countCall.Value + } + + return result +} + +// GetGaugeSummaries computes summaries for all names supplied as parameters to Gauge calls. +func (c *TestStatsClient) GetGaugeSummaries() map[string]*GaugeSummary { + result := map[string]*GaugeSummary{} + + c.mu.RLock() + defer c.mu.RUnlock() + for _, gaugeCall := range c.GaugeCalls { + name := gaugeCall.Name + summary, ok := result[name] + + if !ok { + summary = &GaugeSummary{} + summary.Max = math.MinInt64 + result[name] = summary + } + + summary.Calls = append(summary.Calls, gaugeCall) + summary.Last = gaugeCall.Value + + if gaugeCall.Value > summary.Max { + summary.Max = gaugeCall.Value + } + } + + return result +} diff --git a/pkg/trace/test/testutil/testutil.go b/pkg/trace/test/testutil/testutil.go new file mode 100644 index 0000000000000..5456c9641481f --- /dev/null +++ b/pkg/trace/test/testutil/testutil.go @@ -0,0 +1,11 @@ +// Package testutil provides easy ways to generate some random +// or deterministic data that can be use for tests or benchmarks. +// +// All the publicly shared trace agent model is available. +// +// It avoids the cumbersome step of having to redefine complicated +// structs in every test case and maintain common methods for quick +// access to almost all kind of stub data needed. +// It should NEVER be imported in a program, most likely in one-off +// projects or fuzz modes or test suites. 
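//
// A minimal caller-side sketch (all helpers below are defined in this
// package; the arguments are illustrative only):
//
//	span := testutil.RandomSpan()                // fully random *pb.Span
//	fixed := testutil.TestSpan()                 // deterministic span for reproducible tests
//	traces := testutil.GetTestTrace(10, 5, true) // 10 traces of 5 spans each, realistic IDs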
+package testutil diff --git a/pkg/trace/test/testutil/trace.go b/pkg/trace/test/testutil/trace.go new file mode 100644 index 0000000000000..1283fcb5f566c --- /dev/null +++ b/pkg/trace/test/testutil/trace.go @@ -0,0 +1,112 @@ +package testutil + +import ( + "math/rand" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" +) + +// genNextLevel generates a new level for the trace tree structure, +// having maxSpans as the max number of spans for this level +func genNextLevel(prevLevel []*pb.Span, maxSpans int) []*pb.Span { + var spans []*pb.Span + numSpans := rand.Intn(maxSpans) + 1 + + // the spans have to be "nested" in the previous level + // choose randomly spans from prev level + chosenSpans := rand.Perm(len(prevLevel)) + // cap to a random number > 1 + maxParentSpans := rand.Intn(len(prevLevel)) + if maxParentSpans == 0 { + maxParentSpans = 1 + } + chosenSpans = chosenSpans[:maxParentSpans] + + // now choose a random amount of spans per chosen span + // total needs to be numSpans + for i, prevIdx := range chosenSpans { + prev := prevLevel[prevIdx] + + var childSpans int + value := numSpans - (len(chosenSpans) - i) + if i == len(chosenSpans)-1 || value < 1 { + childSpans = numSpans + } else { + childSpans = rand.Intn(value) + } + numSpans -= childSpans + + timeLeft := prev.Duration + + // create the spans + curSpans := make([]*pb.Span, 0, childSpans) + for j := 0; j < childSpans && timeLeft > 0; j++ { + news := RandomSpan() + news.TraceID = prev.TraceID + news.ParentID = prev.SpanID + + // distribute durations in prev span + // random start + randStart := rand.Int63n(timeLeft) + news.Start = prev.Start + randStart + // random duration + timeLeft -= randStart + news.Duration = rand.Int63n(timeLeft) + timeLeft -= news.Duration + + curSpans = append(curSpans, news) + } + + spans = append(spans, curSpans...) + } + + return spans +} + +// RandomTrace generates a random trace with a depth from 1 to +// maxLevels of spans. Each level has at most maxSpans items. +func RandomTrace(maxLevels, maxSpans int) pb.Trace { + t := pb.Trace{RandomSpan()} + + prevLevel := t + maxDepth := 1 + rand.Intn(maxLevels) + + for i := 0; i < maxDepth; i++ { + if len(prevLevel) > 0 { + prevLevel = genNextLevel(prevLevel, maxSpans) + t = append(t, prevLevel...) + } + } + + return t +} + +// GetTestTrace returns a []Trace that is composed by ``traceN`` number +// of traces, each one composed by ``size`` number of spans. +func GetTestTrace(traceN, size int, realisticIDs bool) pb.Traces { + traces := pb.Traces{} + + r := rand.New(rand.NewSource(42)) + + for i := 0; i < traceN; i++ { + // Calculate a trace ID which is predictable (this is why we seed) + // but still spreads on a wide spectrum so that, among other things, + // sampling algorithms work in a realistic way. + traceID := r.Uint64() + + trace := pb.Trace{} + for j := 0; j < size; j++ { + span := GetTestSpan() + if realisticIDs { + // Need to have different span IDs else traces are rejected + // because they are not correct (indeed, a trace with several + // spans boasting the same span ID is not valid) + span.SpanID += uint64(j) + span.TraceID = traceID + } + trace = append(trace, span) + } + traces = append(traces, trace) + } + return traces +} diff --git a/pkg/trace/traceutil/doc.go b/pkg/trace/traceutil/doc.go new file mode 100644 index 0000000000000..d989a3ef74e93 --- /dev/null +++ b/pkg/trace/traceutil/doc.go @@ -0,0 +1,3 @@ +// Package traceutil contains functions for extracting and processing traces. It should +// only import payload and nothing else. 
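//
// A short usage sketch of the main helpers (as defined in this package):
//
//	traceutil.ComputeTopLevel(trace) // marks top-level spans in place
//	root := traceutil.GetRoot(trace) // best-effort root span lookup
//	env := traceutil.GetEnv(trace)   // first "env" meta value found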
+package traceutil diff --git a/pkg/trace/traceutil/span.go b/pkg/trace/traceutil/span.go new file mode 100644 index 0000000000000..08029b2955556 --- /dev/null +++ b/pkg/trace/traceutil/span.go @@ -0,0 +1,44 @@ +package traceutil + +import "github.com/DataDog/datadog-agent/pkg/trace/pb" + +const ( + // TraceMetricsKey is a tag key which, if set to true, + // ensures all statistics are computed for this span. + // [FIXME] *not implemented yet* + TraceMetricsKey = "datadog.trace_metrics" + + // This is a special metric, it's 1 if the span is top-level, 0 if not. + topLevelKey = "_top_level" +) + +// HasTopLevel returns true if span is top-level. +func HasTopLevel(s *pb.Span) bool { + return s.Metrics[topLevelKey] == 1 +} + +// HasForceMetrics returns true if statistics computation should be forced for this span. +func HasForceMetrics(s *pb.Span) bool { + return s.Meta[TraceMetricsKey] == "true" +} + +// setTopLevel sets the top-level attribute of the span. +func setTopLevel(s *pb.Span, topLevel bool) { + if !topLevel { + if s.Metrics == nil { + return + } + delete(s.Metrics, topLevelKey) + return + } + // Setting the metrics value, so that code downstream in the pipeline + // can identify this as top-level without recomputing everything. + setMetric(s, topLevelKey, 1) +} + +func setMetric(s *pb.Span, key string, val float64) { + if s.Metrics == nil { + s.Metrics = make(map[string]float64) + } + s.Metrics[key] = val +} diff --git a/pkg/trace/traceutil/span_test.go b/pkg/trace/traceutil/span_test.go new file mode 100644 index 0000000000000..ce595fd5dc153 --- /dev/null +++ b/pkg/trace/traceutil/span_test.go @@ -0,0 +1,168 @@ +package traceutil + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func TestTopLevelTypical(t *testing.T) { + assert := assert.New(t) + + tr := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Service: "mcnulty", Type: "web"}, + &pb.Span{TraceID: 1, SpanID: 2, ParentID: 1, Service: "mcnulty", Type: "sql"}, + &pb.Span{TraceID: 1, SpanID: 3, ParentID: 2, Service: "master-db", Type: "sql"}, + &pb.Span{TraceID: 1, SpanID: 4, ParentID: 1, Service: "redis", Type: "redis"}, + &pb.Span{TraceID: 1, SpanID: 5, ParentID: 1, Service: "mcnulty", Type: ""}, + } + + ComputeTopLevel(tr) + + assert.True(HasTopLevel(tr[0]), "root span should be top-level") + assert.False(HasTopLevel(tr[1]), "main service, and not a root span, not top-level") + assert.True(HasTopLevel(tr[2]), "only 1 span for this service, should be top-level") + assert.True(HasTopLevel(tr[3]), "only 1 span for this service, should be top-level") + assert.False(HasTopLevel(tr[4]), "yet another sup span, not top-level") +} + +func TestTopLevelSingle(t *testing.T) { + assert := assert.New(t) + + tr := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Service: "mcnulty", Type: "web"}, + } + + ComputeTopLevel(tr) + + assert.True(HasTopLevel(tr[0]), "root span should be top-level") +} + +func TestTopLevelEmpty(t *testing.T) { + assert := assert.New(t) + + tr := pb.Trace{} + + ComputeTopLevel(tr) + + assert.Equal(0, len(tr), "trace should still be empty") +} + +func TestTopLevelOneService(t *testing.T) { + assert := assert.New(t) + + tr := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 2, ParentID: 1, Service: "mcnulty", Type: "web"}, + &pb.Span{TraceID: 1, SpanID: 3, ParentID: 2, Service: "mcnulty", Type: "web"}, + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Service: "mcnulty", Type: "web"}, + &pb.Span{TraceID: 1, SpanID: 4, ParentID: 1, Service: 
"mcnulty", Type: "web"}, + &pb.Span{TraceID: 1, SpanID: 5, ParentID: 1, Service: "mcnulty", Type: "web"}, + } + + ComputeTopLevel(tr) + + assert.False(HasTopLevel(tr[0]), "just a sub-span, not top-level") + assert.False(HasTopLevel(tr[1]), "just a sub-span, not top-level") + assert.True(HasTopLevel(tr[2]), "root span should be top-level") + assert.False(HasTopLevel(tr[3]), "just a sub-span, not top-level") + assert.False(HasTopLevel(tr[4]), "just a sub-span, not top-level") +} + +func TestTopLevelLocalRoot(t *testing.T) { + assert := assert.New(t) + + tr := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Service: "mcnulty", Type: "web"}, + &pb.Span{TraceID: 1, SpanID: 2, ParentID: 1, Service: "mcnulty", Type: "sql"}, + &pb.Span{TraceID: 1, SpanID: 3, ParentID: 2, Service: "master-db", Type: "sql"}, + &pb.Span{TraceID: 1, SpanID: 4, ParentID: 1, Service: "redis", Type: "redis"}, + &pb.Span{TraceID: 1, SpanID: 5, ParentID: 1, Service: "mcnulty", Type: ""}, + &pb.Span{TraceID: 1, SpanID: 6, ParentID: 4, Service: "redis", Type: "redis"}, + &pb.Span{TraceID: 1, SpanID: 7, ParentID: 4, Service: "redis", Type: "redis"}, + } + + ComputeTopLevel(tr) + + assert.True(HasTopLevel(tr[0]), "root span should be top-level") + assert.False(HasTopLevel(tr[1]), "main service, and not a root span, not top-level") + assert.True(HasTopLevel(tr[2]), "only 1 span for this service, should be top-level") + assert.True(HasTopLevel(tr[3]), "top-level but not root") + assert.False(HasTopLevel(tr[4]), "yet another sup span, not top-level") + assert.False(HasTopLevel(tr[5]), "yet another sup span, not top-level") + assert.False(HasTopLevel(tr[6]), "yet another sup span, not top-level") +} + +func TestTopLevelWithTag(t *testing.T) { + assert := assert.New(t) + + tr := pb.Trace{ + &pb.Span{TraceID: 1, SpanID: 1, ParentID: 0, Service: "mcnulty", Type: "web", Metrics: map[string]float64{"custom": 42}}, + &pb.Span{TraceID: 1, SpanID: 2, ParentID: 1, Service: "mcnulty", Type: "web", Metrics: map[string]float64{"custom": 42}}, + } + + ComputeTopLevel(tr) + + t.Logf("%v\n", tr[1].Metrics) + + assert.True(HasTopLevel(tr[0]), "root span should be top-level") + assert.Equal(float64(42), tr[0].Metrics["custom"], "custom metric should still be here") + assert.False(HasTopLevel(tr[1]), "not a top-level span") + assert.Equal(float64(42), tr[1].Metrics["custom"], "custom metric should still be here") +} + +func TestTopLevelGetSetBlackBox(t *testing.T) { + assert := assert.New(t) + + span := &pb.Span{} + + assert.False(HasTopLevel(span), "by default, all spans are considered non top-level") + setTopLevel(span, true) + assert.True(HasTopLevel(span), "marked as top-level") + setTopLevel(span, false) + assert.False(HasTopLevel(span), "no more top-level") + + span.Metrics = map[string]float64{"custom": 42} + + assert.False(HasTopLevel(span), "by default, all spans are considered non top-level") + setTopLevel(span, true) + assert.True(HasTopLevel(span), "marked as top-level") + setTopLevel(span, false) + assert.False(HasTopLevel(span), "no more top-level") +} + +func TestTopLevelGetSetMetrics(t *testing.T) { + assert := assert.New(t) + + span := &pb.Span{} + + assert.Nil(span.Metrics, "no meta at all") + setTopLevel(span, true) + assert.Equal(float64(1), span.Metrics["_top_level"], "should have a _top_level:1 flag") + setTopLevel(span, false) + assert.Equal(len(span.Metrics), 0, "no meta at all") + + span.Metrics = map[string]float64{"custom": 42} + + assert.False(HasTopLevel(span), "still non top-level") + setTopLevel(span, true) + 
assert.Equal(float64(1), span.Metrics["_top_level"], "should have a _top_level:1 flag") + assert.Equal(float64(42), span.Metrics["custom"], "former metrics should still be here") + assert.True(HasTopLevel(span), "marked as top-level") + setTopLevel(span, false) + assert.False(HasTopLevel(span), "non top-level any more") + assert.Equal(float64(0), span.Metrics["_top_level"], "should have no _top_level:1 flag") + assert.Equal(float64(42), span.Metrics["custom"], "former metrics should still be here") +} + +func TestForceMetrics(t *testing.T) { + assert := assert.New(t) + + span := &pb.Span{} + + assert.False(HasForceMetrics(span), "by default, metrics are not enforced for sub name spans") + span.Meta = map[string]string{"datadog.trace_metrics": "true"} + assert.True(HasForceMetrics(span), "metrics should be enforced because tag is present") + span.Meta = map[string]string{"env": "dev"} + assert.False(HasForceMetrics(span), "there's a tag, but metrics should not be enforced anyway") +} diff --git a/pkg/trace/traceutil/trace.go b/pkg/trace/traceutil/trace.go new file mode 100644 index 0000000000000..a61d7f4f61f52 --- /dev/null +++ b/pkg/trace/traceutil/trace.go @@ -0,0 +1,133 @@ +package traceutil + +import ( + "github.com/DataDog/datadog-agent/pkg/trace/pb" + log "github.com/cihub/seelog" +) + +// GetEnv returns the meta value for the "env" key for +// the first trace it finds or an empty string +func GetEnv(t pb.Trace) string { + // exit this on first success + for _, s := range t { + for k, v := range s.Meta { + if k == "env" { + return v + } + } + } + return "" +} + +// GetRoot extracts the root span from a trace +func GetRoot(t pb.Trace) *pb.Span { + // That should be caught beforehand + if len(t) == 0 { + return nil + } + // General case: go over all spans and check for one which matching parent + parentIDToChild := map[uint64]*pb.Span{} + + for i := range t { + // Common case optimization: check for span with ParentID == 0, starting from the end, + // since some clients report the root last + j := len(t) - 1 - i + if t[j].ParentID == 0 { + return t[j] + } + parentIDToChild[t[j].ParentID] = t[j] + } + + for i := range t { + if _, ok := parentIDToChild[t[i].SpanID]; ok { + delete(parentIDToChild, t[i].SpanID) + } + } + + // Here, if the trace is valid, we should have len(parentIDToChild) == 1 + if len(parentIDToChild) != 1 { + log.Debugf("didn't reliably find the root span for traceID:%v", t[0].TraceID) + } + + // Have a safe bahavior if that's not the case + // Pick the first span without its parent + for parentID := range parentIDToChild { + return parentIDToChild[parentID] + } + + // Gracefully fail with the last span of the trace + return t[len(t)-1] +} + +// APITrace returns an APITrace from the trace, as required by the Datadog API. +func APITrace(t pb.Trace) *pb.APITrace { + var earliest, latest int64 + for _, s := range t { + start := s.Start + if start < earliest { + earliest = start + } + end := s.Start + s.Duration + if end > latest { + latest = end + } + } + return &pb.APITrace{ + TraceID: t[0].TraceID, + Spans: t, + StartTime: earliest, + EndTime: latest, + } +} + +// ChildrenMap returns a map containing for each span id the list of its +// direct children. 
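// For example, given span 1 (the root), spans 2 and 3 with ParentID 1, and
// span 4 with ParentID 2, the resulting map is 1 -> [2, 3] and 2 -> [4];
// non-root leaf spans such as 3 and 4 map to an empty, non-nil slice.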
+func ChildrenMap(t pb.Trace) map[uint64][]*pb.Span { + childrenMap := make(map[uint64][]*pb.Span) + + for i := range t { + span := t[i] + if span.ParentID == 0 { + continue + } + children, ok := childrenMap[span.SpanID] + if !ok { + childrenMap[span.SpanID] = []*pb.Span{} + } + children, ok = childrenMap[span.ParentID] + if ok { + children = append(children, span) + } else { + children = []*pb.Span{span} + } + childrenMap[span.ParentID] = children + } + + return childrenMap +} + +// ComputeTopLevel updates all the spans top-level attribute. +// +// A span is considered top-level if: +// - it's a root span +// - its parent is unknown (other part of the code, distributed trace) +// - its parent belongs to another service (in that case it's a "local root" +// being the highest ancestor of other spans belonging to this service and +// attached to it). +func ComputeTopLevel(t pb.Trace) { + // build a lookup map + spanIDToIdx := make(map[uint64]int, len(t)) + for i, span := range t { + spanIDToIdx[span.SpanID] = i + } + + // iterate on each span and mark them as top-level if relevant + for _, span := range t { + if span.ParentID != 0 { + if parentIdx, ok := spanIDToIdx[span.ParentID]; ok && t[parentIdx].Service == span.Service { + continue + } + } + setTopLevel(span, true) + } +} diff --git a/pkg/trace/traceutil/trace_test.go b/pkg/trace/traceutil/trace_test.go new file mode 100644 index 0000000000000..9e869c3e66e8b --- /dev/null +++ b/pkg/trace/traceutil/trace_test.go @@ -0,0 +1,56 @@ +package traceutil + +import ( + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/stretchr/testify/assert" +) + +func TestGetRootFromCompleteTrace(t *testing.T) { + assert := assert.New(t) + + trace := pb.Trace{ + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12341), Service: "s1", Name: "n1", Resource: ""}, + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12342), ParentID: uint64(12341), Service: "s1", Name: "n1", Resource: ""}, + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12343), ParentID: uint64(12341), Service: "s1", Name: "n1", Resource: ""}, + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12344), ParentID: uint64(12342), Service: "s2", Name: "n2", Resource: ""}, + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12345), ParentID: uint64(12344), Service: "s2", Name: "n2", Resource: ""}, + } + + assert.Equal(GetRoot(trace).SpanID, uint64(12341)) +} + +func TestGetRootFromPartialTrace(t *testing.T) { + assert := assert.New(t) + + trace := pb.Trace{ + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12341), ParentID: uint64(12340), Service: "s1", Name: "n1", Resource: ""}, + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12342), ParentID: uint64(12341), Service: "s1", Name: "n1", Resource: ""}, + &pb.Span{TraceID: uint64(1234), SpanID: uint64(12343), ParentID: uint64(12342), Service: "s2", Name: "n2", Resource: ""}, + } + + assert.Equal(GetRoot(trace).SpanID, uint64(12341)) +} + +func TestTraceChildrenMap(t *testing.T) { + assert := assert.New(t) + + trace := pb.Trace{ + &pb.Span{SpanID: 1, ParentID: 0}, + &pb.Span{SpanID: 2, ParentID: 1}, + &pb.Span{SpanID: 3, ParentID: 1}, + &pb.Span{SpanID: 4, ParentID: 2}, + &pb.Span{SpanID: 5, ParentID: 3}, + &pb.Span{SpanID: 6, ParentID: 4}, + } + + childrenMap := ChildrenMap(trace) + + assert.Equal([]*pb.Span{trace[1], trace[2]}, childrenMap[1]) + assert.Equal([]*pb.Span{trace[3]}, childrenMap[2]) + assert.Equal([]*pb.Span{trace[4]}, childrenMap[3]) + assert.Equal([]*pb.Span{trace[5]}, childrenMap[4]) + assert.Equal([]*pb.Span{}, 
childrenMap[5]) + assert.Equal([]*pb.Span{}, childrenMap[6]) +} diff --git a/pkg/trace/watchdog/info.go b/pkg/trace/watchdog/info.go new file mode 100644 index 0000000000000..2fb9d99bd421a --- /dev/null +++ b/pkg/trace/watchdog/info.go @@ -0,0 +1,179 @@ +package watchdog + +import ( + "os" + "runtime" + "sync" + "time" + + log "github.com/cihub/seelog" + "github.com/shirou/gopsutil/process" +) + +const ( + // cacheDelay should be long enough so that we don't poll the info + // too often and waste resources doing it, and also long enough + // so that it's not jittering (CPU can be volatile). + // OTOH it should be short enough to get up-to-date recent info. + cacheDelay = 20 * time.Second +) + +// CPUInfo contains basic CPU info +type CPUInfo struct { + // UserAvg is the average of the user CPU usage since last time + // it was polled. 0 means "not used at all" and 1 means "1 CPU was + // totally full for that period". So it might be greater than 1 if + // the process is monopolizing several cores. + UserAvg float64 +} + +// MemInfo contains basic memory info +type MemInfo struct { + // Alloc is the number of bytes allocated and not yet freed + // as described in runtime.MemStats.Alloc + Alloc uint64 + // AllocPerSec is the average number of bytes allocated, per second, + // since last time this function was called. + AllocPerSec float64 +} + +// NetInfo contains basic networking info +type NetInfo struct { + // Connections is the number of connections opened by this process. + Connections int32 +} + +// Info contains all the watchdog infos, to be published by expvar +type Info struct { + // CPU contains basic CPU info + CPU CPUInfo + // Mem contains basic Mem info + Mem MemInfo + // Net contains basic Net info + Net NetInfo +} + +// CurrentInfo is used to query CPU and Mem info, it keeps data from +// the previous calls to calculate averages. It is not thread safe. +type CurrentInfo struct { + p *process.Process + mu sync.Mutex + cacheDelay time.Duration + + lastCPUTime time.Time + lastCPUUser float64 + lastCPU CPUInfo + + lastMemTime time.Time + lastMemTotalAlloc uint64 + lastMem MemInfo + + lastNetTime time.Time + lastNet NetInfo +} + +// globalCurrentInfo is a global default object one can safely use +// if only one goroutine is polling for CPU() and Mem() +var globalCurrentInfo *CurrentInfo + +func init() { + var err error + globalCurrentInfo, err = NewCurrentInfo() + if err != nil { + log.Errorf("unable to create global Process: %v", err) + } +} + +// NewCurrentInfo creates a new CurrentInfo referring to the current running program. +func NewCurrentInfo() (*CurrentInfo, error) { + p, err := process.NewProcess(int32(os.Getpid())) + if err != nil { + return nil, err + } + return &CurrentInfo{ + p: p, + cacheDelay: cacheDelay, + }, nil +} + +// CPU returns basic CPU info. +func (pi *CurrentInfo) CPU() CPUInfo { + pi.mu.Lock() + defer pi.mu.Unlock() + + now := time.Now() + dt := now.Sub(pi.lastCPUTime) + if dt <= pi.cacheDelay { + return pi.lastCPU // don't query too often, cache a little bit + } + pi.lastCPUTime = now + + times, err := pi.p.Times() + if err != nil { + log.Debugf("unable to get CPU times: %v", err) + return pi.lastCPU + } + + dua := times.User - pi.lastCPUUser + pi.lastCPUUser = times.User + if dua <= 0 { + pi.lastCPU.UserAvg = 0 // shouldn't happen, but make sure result is always > 0 + } else { + pi.lastCPU.UserAvg = float64(time.Second) * dua / float64(dt) + pi.lastCPUUser = times.User + } + + return pi.lastCPU +} + +// Mem returns basic memory info. 
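// As with CPU, the result is cached: calls made less than cacheDelay apart
// return the previously cached value instead of re-reading runtime.MemStats,
// and AllocPerSec is averaged over the interval since the last refresh.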
+func (pi *CurrentInfo) Mem() MemInfo { + pi.mu.Lock() + defer pi.mu.Unlock() + + now := time.Now() + dt := now.Sub(pi.lastMemTime) + if dt <= pi.cacheDelay { + return pi.lastMem // don't query too often, cache a little bit + } + pi.lastMemTime = now + + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + ret := MemInfo{Alloc: ms.Alloc, AllocPerSec: pi.lastMem.AllocPerSec} + + dta := int64(ms.TotalAlloc) - int64(pi.lastMemTotalAlloc) + pi.lastMemTotalAlloc = ms.TotalAlloc + if dta <= 0 { + pi.lastMem.AllocPerSec = 0 // shouldn't happen, but make sure result is always > 0 + } else { + pi.lastMem.AllocPerSec = float64(time.Second) * float64(dta) / float64(dt) + } + ret.AllocPerSec = pi.lastMem.AllocPerSec + + return ret +} + +// CPU returns basic CPU info. +func CPU() CPUInfo { + if globalCurrentInfo == nil { + return CPUInfo{} + } + return globalCurrentInfo.CPU() +} + +// Mem returns basic memory info. +func Mem() MemInfo { + if globalCurrentInfo == nil { + return MemInfo{} + } + return globalCurrentInfo.Mem() +} + +// Net returns basic network info. +func Net() NetInfo { + if globalCurrentInfo == nil { + return NetInfo{} + } + return globalCurrentInfo.Net() +} diff --git a/pkg/trace/watchdog/info_test.go b/pkg/trace/watchdog/info_test.go new file mode 100644 index 0000000000000..eda4219a48829 --- /dev/null +++ b/pkg/trace/watchdog/info_test.go @@ -0,0 +1,246 @@ +package watchdog + +import ( + "fmt" + "net/http" + "net/http/httptest" + "os" + "runtime" + "testing" + "time" + + "github.com/shirou/gopsutil/process" + "github.com/stretchr/testify/assert" +) + +const ( + testDuration = time.Second +) + +func TestCPULow(t *testing.T) { + assert := assert.New(t) + runtime.GC() + + c := CPU() + globalCurrentInfo.cacheDelay = testDuration + time.Sleep(testDuration) + c = CPU() + t.Logf("CPU (sleep): %v", c) + + // checking that CPU is low enough, this is theorically flaky, + // but eating 50% of CPU for a time.Sleep is still not likely to happen often + assert.Condition(func() bool { return c.UserAvg >= 0.0 }, fmt.Sprintf("cpu avg should be positive, got %f", c.UserAvg)) + assert.Condition(func() bool { return c.UserAvg <= 0.5 }, fmt.Sprintf("cpu avg should be below 0.5, got %f", c.UserAvg)) +} + +func doTestCPUHigh(t *testing.T, n int) { + assert := assert.New(t) + runtime.GC() + + done := make(chan struct{}, 1) + c := CPU() + globalCurrentInfo.cacheDelay = testDuration + for i := 0; i < n; i++ { + go func() { + j := 0 + for { + select { + case <-done: + return + default: + j++ + } + } + }() + } + time.Sleep(testDuration) + c = CPU() + for i := 0; i < n; i++ { + done <- struct{}{} + } + t.Logf("CPU (%d goroutines): %v", n, c) + + // Checking that CPU is high enough, a very simple ++ loop should be + // enough to stimulate one core and make it over 50%. One of the goals + // of this test is to check that values are not wrong by a factor 100, such + // as mismatching percentages and [0...1] values. 
+ assert.Condition(func() bool { return c.UserAvg >= 0.5 }, fmt.Sprintf("cpu avg is too low, got %f", c.UserAvg)) + assert.Condition(func() bool { return c.UserAvg <= float64(n+1) }, fmt.Sprintf("cpu avg is too high, target is %d, got %f", n, c.UserAvg)) +} + +func TestCPUHigh(t *testing.T) { + doTestCPUHigh(t, 1) + if testing.Short() { + return + } + doTestCPUHigh(t, 10) + doTestCPUHigh(t, 100) +} + +func TestMemLow(t *testing.T) { + assert := assert.New(t) + runtime.GC() + + oldM := Mem() + globalCurrentInfo.cacheDelay = testDuration + time.Sleep(testDuration) + m := Mem() + t.Logf("Mem (sleep): %v", m) + + // Checking that Mem is low enough, this is theorically flaky, + // unless some other random GoRoutine is running, figures should remain low + assert.True(int64(m.Alloc)-int64(oldM.Alloc) <= 1e4, "over 10 Kb allocated since last call, way to high for almost no operation") + assert.True(m.Alloc <= 1e8, "over 100 Mb allocated, way to high for almost no operation") + assert.True(m.AllocPerSec >= 0.0, "allocs per sec should be positive") + assert.True(m.AllocPerSec <= 1e5, "over 100 Kb allocated per sec, way too high for a program doing nothing") +} + +func doTestMemHigh(t *testing.T, n int) { + assert := assert.New(t) + runtime.GC() + + done := make(chan struct{}, 1) + data := make(chan []byte, 1) + oldM := Mem() + globalCurrentInfo.cacheDelay = testDuration + go func() { + a := make([]byte, n) + a[0] = 1 + a[n-1] = 1 + data <- a + select { + case <-done: + } + }() + time.Sleep(testDuration) + m := Mem() + done <- struct{}{} + + t.Logf("Mem (%d bytes): %v", n, m) + + // Checking that Mem is high enough + assert.True(m.Alloc >= uint64(n), "not enough bytes allocated") + assert.True(int64(m.Alloc)-int64(oldM.Alloc) >= int64(n), "not enough bytes allocated since last call") + expectedAllocPerSec := float64(n) * float64(time.Second) / (float64(testDuration)) + assert.True(m.AllocPerSec >= 0.1*expectedAllocPerSec, fmt.Sprintf("not enough bytes allocated per second, expected %f", expectedAllocPerSec)) + assert.True(m.AllocPerSec <= 1.5*expectedAllocPerSec, fmt.Sprintf("not enough bytes allocated per second, expected %f", expectedAllocPerSec)) + <-data +} + +func TestMemHigh(t *testing.T) { + doTestMemHigh(t, 1e5) + if testing.Short() { + return + } + doTestMemHigh(t, 1e7) +} + +type testNetHandler struct { + t *testing.T +} + +func (h *testNetHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + r.Body.Close() + h.t.Logf("request") +} + +func newTestNetServer(t *testing.T) *httptest.Server { + assert := assert.New(t) + server := httptest.NewServer(&testNetHandler{t: t}) + assert.NotNil(server) + t.Logf("server on %v", server.URL) + return server +} + +func doTestNetHigh(t *testing.T, n int) { + assert := assert.New(t) + runtime.GC() + + servers := make([]*httptest.Server, n) + for i := range servers { + servers[i] = newTestNetServer(t) + } + time.Sleep(testDuration) + info := Net() + t.Logf("Net: %v", info) + for _, v := range servers { + v.Close() + } + + // Checking that Net connections number is in a reasonable range + assert.True(info.Connections >= int32(n/2), fmt.Sprintf("not enough connections %d < %d / 2", info.Connections, n)) + assert.True(info.Connections <= int32(n*3), fmt.Sprintf("not enough connections %d > %d * 3", info.Connections, n)) +} + +func TestNetHigh(t *testing.T) { + doTestNetHigh(t, 10) + if testing.Short() { + return + } + doTestNetHigh(t, 200) +} + +func TestNetLow(t *testing.T) { + assert := assert.New(t) + runtime.GC() + + 
time.Sleep(testDuration) + info := Net() + t.Logf("Net: %v", info) + + // Checking that Net connections number is low enough, this is theoretically flaky, + // unless some other random goroutine is running, figures should remain low + assert.True(int32(info.Connections) <= 10, "over 10 connections open when we're doing nothing, way too high") +} + +func BenchmarkCPU(b *testing.B) { + CPU() // make sure globalCurrentInfo exists + globalCurrentInfo.cacheDelay = 0 // disable cache + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = CPU() + } +} + +func BenchmarkMem(b *testing.B) { + Mem() // make sure globalCurrentInfo exists + globalCurrentInfo.cacheDelay = 0 // disable cache + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = Mem() + } +} + +func BenchmarkNet(b *testing.B) { + Net() // make sure globalCurrentInfo exists + globalCurrentInfo.cacheDelay = 0 // disable cache + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = Net() + } +} + +func BenchmarkCPUTimes(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + p, err := process.NewProcess(int32(os.Getpid())) + if err != nil { + b.Fatalf("unable to create Process: %v", err) + } + for i := 0; i < b.N; i++ { + _, _ = p.Times() + } +} + +func BenchmarkReadMemStats(b *testing.B) { + var ms runtime.MemStats + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + runtime.ReadMemStats(&ms) + } +} diff --git a/pkg/trace/watchdog/logonpanic.go b/pkg/trace/watchdog/logonpanic.go new file mode 100644 index 0000000000000..da113fdc2d5ca --- /dev/null +++ b/pkg/trace/watchdog/logonpanic.go @@ -0,0 +1,43 @@ +package watchdog + +import ( + "fmt" + "runtime" + + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + log "github.com/cihub/seelog" +) + +const shortErrMsgLen = 17 // 20 char max with trailing "..." + +// shortErrMsg shortens the error message to avoid having high +// cardinality on "err:" tags +func shortErrMsg(msg string) string { + if len(msg) <= shortErrMsgLen { + return msg + } + return msg[:shortErrMsgLen] + "..." +} + +// LogOnPanic catches panics and logs them on the fly. It also flushes +// the log file, ensuring the message appears. Then it propagates the panic +// so that the program flow remains unchanged.
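// A minimal usage sketch; because recover only works within the deferring
// goroutine, every goroutine that should be covered needs its own deferred
// call:
//
//	go func() {
//		defer watchdog.LogOnPanic()
//		// ... work that may panic ...
//	}()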
+func LogOnPanic() { + if err := recover(); err != nil { + // Full print of the trace in the logs + buf := make([]byte, 4096) + length := runtime.Stack(buf, false) + stacktrace := string(buf[:length]) + errMsg := fmt.Sprintf("%v", err) + logMsg := "Unexpected panic: " + errMsg + "\n" + stacktrace + + metrics.Gauge("datadog.trace_agent.panic", 1, []string{ + "err:" + shortErrMsg(errMsg), + }, 1) + + log.Error(logMsg) + log.Flush() + + panic(err) + } +} diff --git a/pkg/trace/watchdog/logonpanic_test.go b/pkg/trace/watchdog/logonpanic_test.go new file mode 100644 index 0000000000000..f251e10f5b999 --- /dev/null +++ b/pkg/trace/watchdog/logonpanic_test.go @@ -0,0 +1,100 @@ +package watchdog + +import ( + "bytes" + "fmt" + "sync" + "testing" + + log "github.com/cihub/seelog" + "github.com/stretchr/testify/assert" +) + +var testLogBuf bytes.Buffer + +func init() { + logger, err := log.LoggerFromWriterWithMinLevelAndFormat(&testLogBuf, log.DebugLvl, "%Ns [%Level] %Msg") + if err != nil { + panic(err) + } + err = log.ReplaceLogger(logger) + if err != nil { + panic(err) + } +} + +func TestLogOnPanicMain(t *testing.T) { + assert := assert.New(t) + + defer func() { + r := recover() + assert.NotNil(r, "panic should bubble up and be trapped here") + assert.Contains(fmt.Sprintf("%v", r), + "integer divide by zero", + "divide by zero panic should be forwarded") + msg := testLogBuf.String() + assert.Contains(msg, + "Unexpected panic: runtime error: integer divide by zero", + "divide by zero panic should be reported in log") + assert.Contains(msg, + "github.com/DataDog/datadog-agent/pkg/trace/watchdog.TestLogOnPanicMain", + "log should contain a reference to this test func name as it displays the stack trace") + }() + defer LogOnPanic() + zero := 0 + _ = 1 / zero +} + +func TestLogOnPanicGoroutine(t *testing.T) { + assert := assert.New(t) + + var wg sync.WaitGroup + wg.Add(1) + + go func() { + defer func() { + r := recover() + assert.NotNil(r, "panic should bubble up and be trapped here") + assert.Contains(fmt.Sprintf("%v", r), + "what could possibly go wrong?", + "custom panic should be forwarded") + msg := testLogBuf.String() + assert.Contains(msg, + "Unexpected panic: what could possibly go wrong?", + "custom panic should be reported in log") + assert.Contains(msg, + "github.com/DataDog/datadog-agent/pkg/trace/watchdog.TestLogOnPanicGoroutine", + "log should contain a reference to this test func name as it displays the stack trace") + wg.Done() + }() + defer LogOnPanic() + panic("what could possibly go wrong?") + }() + defer func() { + r := recover() + assert.Nil(r, "this should trap no error at all, what we demonstrate here is that recover needs to be called on a per-goroutine base") + }() + wg.Wait() +} + +func TestShortErrMsg(t *testing.T) { + assert := assert.New(t) + + expected := map[string]string{ + "exceeded max connections": "exceeded max conn...", + "cannot configure dogstatsd": "cannot configure ...", + "ooops": "ooops", + "0123456789abcdef": "0123456789abcdef", + "0123456789abcdef0": "0123456789abcdef0", + "0123456789abcdef01": "0123456789abcdef0...", + "0123456789abcdef012": "0123456789abcdef0...", + "0123456789abcdef0123": "0123456789abcdef0...", + "0123456789abcdef01234": "0123456789abcdef0...", + "": "", + "αβγ": "αβγ", + } + + for k, v := range expected { + assert.Equal(v, shortErrMsg(k), "short error message for '%s' should be '%s'", k, v) + } +} diff --git a/pkg/trace/watchdog/net.go b/pkg/trace/watchdog/net.go new file mode 100644 index 0000000000000..3f29371bd5c37 --- /dev/null +++ 
b/pkg/trace/watchdog/net.go @@ -0,0 +1,33 @@ +// +build !windows + +package watchdog + +import ( + log "github.com/cihub/seelog" + "github.com/shirou/gopsutil/net" + "os" + "time" +) + +// Net returns basic network info. +func (pi *CurrentInfo) Net() NetInfo { + pi.mu.Lock() + defer pi.mu.Unlock() + + now := time.Now() + dt := now.Sub(pi.lastNetTime) + if dt <= pi.cacheDelay { + return pi.lastNet // don't query too often, cache a little bit + } + pi.lastNetTime = now + + connections, err := net.ConnectionsPid("tcp", int32(os.Getpid())) + if err != nil { + log.Debugf("unable to get Net connections: %v", err) + return pi.lastNet + } + + pi.lastNet.Connections = int32(len(connections)) + + return pi.lastNet +} diff --git a/pkg/trace/watchdog/net_windows.go b/pkg/trace/watchdog/net_windows.go new file mode 100644 index 0000000000000..e305d5c2a2e81 --- /dev/null +++ b/pkg/trace/watchdog/net_windows.go @@ -0,0 +1,6 @@ +package watchdog + +// Net for windows returns basic network info without the number of connections. +func (pi *CurrentInfo) Net() NetInfo { + return NetInfo{} +} diff --git a/pkg/trace/writer/backoff/backoff.go b/pkg/trace/writer/backoff/backoff.go new file mode 100644 index 0000000000000..a2eee4cb25d48 --- /dev/null +++ b/pkg/trace/writer/backoff/backoff.go @@ -0,0 +1,84 @@ +package backoff + +import "time" + +// Timer represents a timer that implements some backOff strategy that can adapt to number of schedulings. +type Timer interface { + ScheduleRetry(err error) (int, time.Duration) + CurrentDelay() time.Duration + NumRetries() int + ReceiveTick() <-chan time.Time + Reset() + Stop() +} + +// DelayProvider is a function that takes the current numRetries and last error and returns the delay until next retry. +type DelayProvider func(numRetries int, err error) time.Duration + +// CustomTimer represents a backoff timer configured with a certain DelayProvider. +type CustomTimer struct { + numRetries int + currentDelay time.Duration + + delayProvider DelayProvider + + tickChannel chan time.Time + timer *time.Timer +} + +// NewCustomTimer creates a new custom timer using the provided delay provider. +func NewCustomTimer(delayProvider DelayProvider) *CustomTimer { + return &CustomTimer{ + delayProvider: delayProvider, + tickChannel: make(chan time.Time), + } +} + +// ScheduleRetry schedules the next retry tick according to the delay provider, returning retry num and retry delay. +func (t *CustomTimer) ScheduleRetry(err error) (int, time.Duration) { + t.Stop() + t.currentDelay = t.delayProvider(t.numRetries, err) + + t.timer = time.AfterFunc(t.currentDelay, func() { + t.tickChannel <- time.Now() + }) + + t.numRetries++ + + return t.numRetries, t.currentDelay +} + +// CurrentDelay returns the delay of the current or last ticked retry. +func (t *CustomTimer) CurrentDelay() time.Duration { + return t.currentDelay +} + +// NumRetries returns the number of tries since this timer was last reset. +func (t *CustomTimer) NumRetries() int { + return t.numRetries +} + +// ReceiveTick returns a channel that will receive a time.Time object as soon as the previously scheduled retry ticks. +func (t *CustomTimer) ReceiveTick() <-chan time.Time { + return t.tickChannel +} + +// Reset stops and resets the number of retries counter of this timer. +func (t *CustomTimer) Reset() { + t.Stop() + t.numRetries = 0 + t.currentDelay = 0 +} + +// Stop prevents any current scheduled retry from ticking. 
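// Stopping does not clear NumRetries or CurrentDelay; call Reset for that,
// or Close once the timer is no longer needed.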
+func (t *CustomTimer) Stop() { + if t.timer != nil { + t.timer.Stop() + } +} + +// Close cleans up the resources used by this timer. It cannot be reused after this call. +func (t *CustomTimer) Close() { + t.Reset() + close(t.tickChannel) +} diff --git a/pkg/trace/writer/backoff/backoff_test.go b/pkg/trace/writer/backoff/backoff_test.go new file mode 100644 index 0000000000000..477d7dbda575e --- /dev/null +++ b/pkg/trace/writer/backoff/backoff_test.go @@ -0,0 +1,117 @@ +package backoff + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +type SpecialError struct{} + +func (*SpecialError) Error() string { + return "this is a very special error" +} + +func TestCustomTimer_ScheduleRetry(t *testing.T) { + assert := assert.New(t) + + testDelay := 200 * time.Millisecond + + timer := NewCustomTimer(func(numRetries int, err error) time.Duration { + if _, ok := err.(*SpecialError); ok { + // If special error use fixed delay of 100 ms + return 100 * time.Millisecond + } + + // If normal error (or nil) + return time.Duration(int64(1+numRetries) * int64(testDelay)) + }) + + // First schedule (numRetries == 0) + callTime := time.Now() + timer.ScheduleRetry(nil) + assert.Equal(testDelay, timer.CurrentDelay(), "Timer should report correct retry delay") + + select { + case tickTime := <-timer.ReceiveTick(): + assert.WithinDuration(tickTime, callTime, time.Duration(1.5*float64(testDelay)), + "Tick time and call time should be within expected delay of each other (with a small margin)") + case <-time.After(1 * time.Second): + assert.Fail("Received no tick within 500ms") + } + + // Second schedule (numRetries == 1) + callTime = time.Now() + timer.ScheduleRetry(nil) + assert.Equal(time.Duration(2*testDelay), timer.CurrentDelay(), "Timer should report correct retry delay") + + select { + case tickTime := <-timer.ReceiveTick(): + assert.WithinDuration(tickTime, callTime, time.Duration(2.5*float64(testDelay)), + "Tick time and call time should be within expected delay of each other (with a small margin)") + case <-time.After(1 * time.Second): + assert.Fail("Received no tick within 500ms") + } + + // Third schedule (numRetries == 2 but error is SpecialError) + callTime = time.Now() + timer.ScheduleRetry(&SpecialError{}) + assert.Equal(100*time.Millisecond, timer.CurrentDelay(), "Timer should report correct retry delay") + + select { + case tickTime := <-timer.ReceiveTick(): + assert.WithinDuration(tickTime, callTime, time.Duration(200*time.Millisecond), + "Tick time and call time should be within expected delay of each other (with a small margin)") + case <-time.After(1 * time.Second): + assert.Fail("Received no tick within 500ms") + } + + timer.Close() +} + +func TestCustomTimer_StopNotTicked(t *testing.T) { + assert := assert.New(t) + + testDelay := 100 * time.Millisecond + + timer := NewCustomTimer(func(_ int, _ error) time.Duration { return testDelay }) + + timer.ScheduleRetry(nil) + timer.Stop() + + select { + case <-timer.ReceiveTick(): + assert.Fail("Shouldn't have received tick because timer was stopped") + case <-time.After(2 * testDelay): + assert.True(true, "Should end without receiving anything") + } + + assert.Equal(1, timer.NumRetries(), "Stopping the timer should not have reset it") + assert.Equal(testDelay, timer.CurrentDelay(), "Stopping the timer should not have reset it") + + timer.Close() +} + +func TestCustomTimer_Reset(t *testing.T) { + assert := assert.New(t) + + testDelay := 100 * time.Millisecond + + timer := NewCustomTimer(func(_ int, _ error) time.Duration { 
return testDelay }) + + timer.ScheduleRetry(nil) + timer.Reset() + + select { + case <-timer.ReceiveTick(): + assert.Fail("Shouldn't have received tick because resetting a timer should also stop it") + case <-time.After(2 * testDelay): + assert.True(true, "Should end without receiving anything") + } + + assert.Equal(0, timer.NumRetries(), "Timer should have been reset") + assert.Equal(0*time.Millisecond, timer.CurrentDelay(), "Timer should have been reset") + + timer.Close() +} diff --git a/pkg/trace/writer/backoff/exponential.go b/pkg/trace/writer/backoff/exponential.go new file mode 100644 index 0000000000000..6e279abb17bd0 --- /dev/null +++ b/pkg/trace/writer/backoff/exponential.go @@ -0,0 +1,79 @@ +package backoff + +import ( + "math" + "math/rand" + "time" +) + +// ExponentialConfig holds the parameters used by the ExponentialTimer. +type ExponentialConfig struct { + MaxDuration time.Duration + GrowthBase int + Base time.Duration +} + +// DefaultExponentialConfig creates an ExponentialConfig with default values. +func DefaultExponentialConfig() ExponentialConfig { + return ExponentialConfig{ + MaxDuration: 120 * time.Second, + GrowthBase: 2, + Base: 200 * time.Millisecond, + } +} + +// DefaultExponentialDelayProvider creates a new instance of an ExponentialDelayProvider using the default config. +func DefaultExponentialDelayProvider() DelayProvider { + return ExponentialDelayProvider(DefaultExponentialConfig()) +} + +// ExponentialDelayProvider creates a new instance of an ExponentialDelayProvider using the provided config. +func ExponentialDelayProvider(conf ExponentialConfig) DelayProvider { + return exponentialDelayProviderCustomRandom(conf, rand.New(rand.NewSource(time.Now().UnixNano()))) +} + +// exponentialDelayProviderCustomRandom creates a new instance of ExponentialDelayProvider using the provided config +// and random number generator. +func exponentialDelayProviderCustomRandom(conf ExponentialConfig, rand *rand.Rand) DelayProvider { + return func(numRetries int, _ error) time.Duration { + pow := math.Pow(float64(conf.GrowthBase), float64(numRetries)) + + // Correctly handle overflowing pow + if pow < 0 || pow > math.MaxInt64 { + pow = math.MaxInt64 + } + + mul := int64(pow) * int64(conf.Base) + + // Correctly handle overflowing mul + if pow != 0 && mul/int64(pow) != int64(conf.Base) { + mul = math.MaxInt64 + } + + newExpDuration := time.Duration(mul) + + if newExpDuration > conf.MaxDuration { + newExpDuration = conf.MaxDuration + } + + return time.Duration(rand.Int63n(int64(newExpDuration))) + } +} + +// ExponentialTimer performs an exponential backoff following the FullJitter implementation described in +// https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ +type ExponentialTimer struct { + CustomTimer +} + +// NewExponentialTimer creates an exponential backoff timer using the default configuration. +func NewExponentialTimer() *ExponentialTimer { + return NewCustomExponentialTimer(DefaultExponentialConfig()) +} + +// NewCustomExponentialTimer creates an exponential backoff timer using the provided configuration. 
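// A caller-side sketch of the retry loop these timers are meant for
// (trySend and payload are illustrative placeholders):
//
//	timer := backoff.NewExponentialTimer()
//	for {
//		if err := trySend(payload); err == nil {
//			break
//		} else {
//			timer.ScheduleRetry(err)
//			<-timer.ReceiveTick()
//		}
//	}
//	timer.Close()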
+func NewCustomExponentialTimer(conf ExponentialConfig) *ExponentialTimer { + return &ExponentialTimer{ + CustomTimer: *NewCustomTimer(ExponentialDelayProvider(conf)), + } +} diff --git a/pkg/trace/writer/backoff/exponential_test.go b/pkg/trace/writer/backoff/exponential_test.go new file mode 100644 index 0000000000000..6cd9895aea466 --- /dev/null +++ b/pkg/trace/writer/backoff/exponential_test.go @@ -0,0 +1,92 @@ +package backoff + +import ( + "fmt" + "math" + "math/rand" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +var errBogus = fmt.Errorf("bogus error") + +func TestDefaultRandomSeed(t *testing.T) { + assert := assert.New(t) + + delayProvider1 := DefaultExponentialDelayProvider() + delayProvider2 := DefaultExponentialDelayProvider() + + // Ensure different timers are not synchronized in their backoffing (use different seeds) + assert.NotEqual(delayProvider1(0, nil), delayProvider2(0, nil)) +} + +func TestExponentialDelay(t *testing.T) { + assert := assert.New(t) + + conf := ExponentialConfig{ + // Use nanoseconds to reduce universe from which randoms are chosen. Seconds should be the same, just scaled. + MaxDuration: 120 * time.Nanosecond, + GrowthBase: 2, + Base: time.Nanosecond, + } + + // Use fixed random to prevent flakiness in case the CI has very bad luck + delayProvider := exponentialDelayProviderCustomRandom(conf, rand.New(rand.NewSource(1234))) + + prevMax := int64(0) + + // Try successive calls to delayProvider with increasing numRetries (from 0 to 19). + for i := 0; i < 20; i++ { + expectedMax := int64(math.Pow(2, float64(i))) + + if expectedMax > int64(conf.MaxDuration) { + expectedMax = int64(conf.MaxDuration) + } + + // For each value of numRetries, get min and max value we saw over 500 calls + min, max := minMaxForSample(delayProvider, 500, i) + + assert.True(max <= expectedMax, "Max should be lower or equal to expected max. Max: %d, expected: %d", max, + expectedMax) + assert.True(max >= prevMax, "Max should grow because this is exp. backoff. Current: %d, prev: %d", + max, prevMax) + assert.True(min <= max/2, "Minimum should be 'far' from max since this should be jittery. Min: %d, max: %d", + min, max) + + prevMax = max + } +} + +func TestExponentialOverflow(t *testing.T) { + assert := assert.New(t) + + delayProvider := DefaultExponentialDelayProvider() + + assert.NotPanics(func() { + min, max := minMaxForSample(delayProvider, 300, 1024) + + assert.True(min >= 0, "Min should be greater or equal to 0") + assert.True(max <= int64(DefaultExponentialConfig().MaxDuration), "Min should be greater or equal to 0") + }) +} + +func minMaxForSample(delayProvider DelayProvider, n int, numTries int) (min, max int64) { + max = 0 + min = math.MaxInt64 + + for i := 0; i < n; i++ { + delay := int64(delayProvider(numTries, nil)) + + if delay > max { + max = delay + } + + if delay < min { + min = delay + } + } + + return +} diff --git a/pkg/trace/writer/config/payload.go b/pkg/trace/writer/config/payload.go new file mode 100644 index 0000000000000..52304e8b4c6fc --- /dev/null +++ b/pkg/trace/writer/config/payload.go @@ -0,0 +1,25 @@ +package config + +import ( + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/writer/backoff" +) + +// QueuablePayloadSenderConf contains the configuration needed by a QueuablePayloadSender to operate. 
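// A caller-side sketch of tweaking the defaults (see
// DefaultQueuablePayloadSenderConf below):
//
//	conf := config.DefaultQueuablePayloadSenderConf()
//	conf.MaxQueuedPayloads = 100   // bound the queue length
//	conf.MaxAge = 10 * time.Minute // adjust the maximum age of queued payloads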
+type QueuablePayloadSenderConf struct { + MaxAge time.Duration + MaxQueuedBytes int64 + MaxQueuedPayloads int + ExponentialBackoff backoff.ExponentialConfig +} + +// DefaultQueuablePayloadSenderConf constructs a QueuablePayloadSenderConf with default sane options. +func DefaultQueuablePayloadSenderConf() QueuablePayloadSenderConf { + return QueuablePayloadSenderConf{ + MaxAge: 20 * time.Minute, + MaxQueuedBytes: 64 * 1024 * 1024, // 64 MB + MaxQueuedPayloads: -1, // Unlimited + ExponentialBackoff: backoff.DefaultExponentialConfig(), + } +} diff --git a/pkg/trace/writer/config/service_writer.go b/pkg/trace/writer/config/service_writer.go new file mode 100644 index 0000000000000..7007c9af5a231 --- /dev/null +++ b/pkg/trace/writer/config/service_writer.go @@ -0,0 +1,21 @@ +package config + +import ( + "time" +) + +// ServiceWriterConfig contains the configuration to customize the behaviour of a ServiceWriter. +type ServiceWriterConfig struct { + FlushPeriod time.Duration + UpdateInfoPeriod time.Duration + SenderConfig QueuablePayloadSenderConf +} + +// DefaultServiceWriterConfig creates a new instance of a ServiceWriterConfig using default values. +func DefaultServiceWriterConfig() ServiceWriterConfig { + return ServiceWriterConfig{ + FlushPeriod: 5 * time.Second, + UpdateInfoPeriod: 1 * time.Minute, + SenderConfig: DefaultQueuablePayloadSenderConf(), + } +} diff --git a/pkg/trace/writer/config/stats_writer.go b/pkg/trace/writer/config/stats_writer.go new file mode 100644 index 0000000000000..6743b64d7f9c9 --- /dev/null +++ b/pkg/trace/writer/config/stats_writer.go @@ -0,0 +1,26 @@ +package config + +import "time" + +// maxEntriesPerPayload is the maximum number of entries in a stat payload. An +// entry has an average size of 125 bytes in a compressed payload. The current +// Datadog intake API limits a compressed payload to ~3MB (24,000 entries), but +// let's have the default ensure we don't have paylods > 1.5 MB (12,000 +// entries). +const maxEntriesPerPayload = 12000 + +// StatsWriterConfig contains the configuration to customize the behaviour of a TraceWriter. +type StatsWriterConfig struct { + MaxEntriesPerPayload int + UpdateInfoPeriod time.Duration + SenderConfig QueuablePayloadSenderConf +} + +// DefaultStatsWriterConfig creates a new instance of a StatsWriterConfig using default values. +func DefaultStatsWriterConfig() StatsWriterConfig { + return StatsWriterConfig{ + MaxEntriesPerPayload: maxEntriesPerPayload, + UpdateInfoPeriod: 1 * time.Minute, + SenderConfig: DefaultQueuablePayloadSenderConf(), + } +} diff --git a/pkg/trace/writer/config/trace_writer.go b/pkg/trace/writer/config/trace_writer.go new file mode 100644 index 0000000000000..61e6896a5e8b3 --- /dev/null +++ b/pkg/trace/writer/config/trace_writer.go @@ -0,0 +1,21 @@ +package config + +import "time" + +// TraceWriterConfig contains the configuration to customize the behaviour of a TraceWriter. +type TraceWriterConfig struct { + MaxSpansPerPayload int + FlushPeriod time.Duration + UpdateInfoPeriod time.Duration + SenderConfig QueuablePayloadSenderConf +} + +// DefaultTraceWriterConfig creates a new instance of a TraceWriterConfig using default values. 
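+// Callers needing different behaviour can start from these defaults and override
+// individual fields (hypothetical values shown):
+//
+//	cfg := DefaultTraceWriterConfig()
+//	cfg.FlushPeriod = 2 * time.Second
+//	cfg.MaxSpansPerPayload = 500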
+func DefaultTraceWriterConfig() TraceWriterConfig { + return TraceWriterConfig{ + MaxSpansPerPayload: 1000, + FlushPeriod: 5 * time.Second, + UpdateInfoPeriod: 1 * time.Minute, + SenderConfig: DefaultQueuablePayloadSenderConf(), + } +} diff --git a/pkg/trace/writer/endpoint.go b/pkg/trace/writer/endpoint.go new file mode 100644 index 0000000000000..19739ea8ddda1 --- /dev/null +++ b/pkg/trace/writer/endpoint.go @@ -0,0 +1,175 @@ +package writer + +import ( + "bytes" + "crypto/tls" + "errors" + "fmt" + "net" + "net/http" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + log "github.com/cihub/seelog" +) + +const languageHeaderKey = "X-Datadog-Reported-Languages" + +// endpoint is an interface where we send the data from the Agent. +type endpoint interface { + // Write writes the payload to the endpoint. + write(payload *payload) error + + // baseURL returns the base URL for this endpoint. e.g. For the URL "https://trace.agent.datadoghq.eu/api/v0.2/traces" + // it returns "https://trace.agent.datadoghq.eu". + baseURL() string +} + +// nullEndpoint is a void endpoint dropping data. +type nullEndpoint struct{} + +// Write of nullEndpoint just drops the payload and log its size. +func (ne *nullEndpoint) write(payload *payload) error { + log.Debug("null endpoint: dropping payload, size: %d", len(payload.bytes)) + return nil +} + +// BaseURL implements Endpoint. +func (ne *nullEndpoint) baseURL() string { return "" } + +// retriableError is an endpoint error that signifies that the associated operation can be retried at a later point. +type retriableError struct { + err error + endpoint endpoint +} + +// Error returns the error string. +func (re *retriableError) Error() string { + return fmt.Sprintf("%s: %v", re.endpoint, re.err) +} + +const ( + userAgentPrefix = "Datadog Trace Agent" + userAgentSupportURL = "https://github.com/DataDog/datadog-trace-agent" +) + +// userAgent is the computed user agent we'll use when +// communicating with Datadog +var userAgent = fmt.Sprintf( + "%s/%s/%s (+%s)", + userAgentPrefix, info.Version, info.GitCommit, userAgentSupportURL, +) + +// datadogEndpoint sends payloads to Datadog API. +type datadogEndpoint struct { + apiKey string + host string + client *http.Client + path string +} + +// NewEndpoints returns the set of endpoints configured in the AgentConfig, appending the given path. +// The first endpoint is the main API endpoint, followed by any additional endpoints. +func newEndpoints(conf *config.AgentConfig, path string) []endpoint { + if !conf.Enabled { + log.Info("API interface is disabled, flushing to /dev/null instead") + return []endpoint{&nullEndpoint{}} + } + if e := conf.Endpoints; len(e) == 0 || e[0].Host == "" || e[0].APIKey == "" { + panic(errors.New("must have at least one endpoint with key")) + } + endpoints := make([]endpoint, len(conf.Endpoints)) + ignoreProxy := true + client := newClient(conf, !ignoreProxy) + clientIgnoreProxy := newClient(conf, ignoreProxy) + for i, e := range conf.Endpoints { + c := client + if e.NoProxy { + c = clientIgnoreProxy + } + endpoints[i] = &datadogEndpoint{ + apiKey: e.APIKey, + host: e.Host, + path: path, + client: c, + } + } + return endpoints +} + +// baseURL implements Endpoint. +func (e *datadogEndpoint) baseURL() string { return e.host } + +// write will send the serialized traces payload to the Datadog traces endpoint. 
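+// Transport errors and 5xx responses are wrapped in *retriableError so that the
+// queuing sender can keep the payload and retry it later; any other non-2xx status
+// is returned as a plain error and the payload is dropped. A caller can tell the two
+// apart with a type assertion (sketch):
+//
+//	if err := e.write(p); err != nil {
+//		if _, ok := err.(*retriableError); ok {
+//			// queue p and schedule a retry
+//		} else {
+//			// give up on p
+//		}
+//	}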
+func (e *datadogEndpoint) write(payload *payload) error { + // Create the request to be sent to the API + url := e.host + e.path + req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload.bytes)) + if err != nil { + return err + } + + req.Header.Set("DD-Api-Key", e.apiKey) + req.Header.Set("User-Agent", userAgent) + for key, value := range payload.headers { + req.Header.Set(key, value) + } + + resp, err := e.client.Do(req) + + if err != nil { + return &retriableError{ + err: err, + endpoint: e, + } + } + defer resp.Body.Close() + + // We check the status code to see if the request has succeeded. + // TODO: define all legit status code and behave accordingly. + if resp.StatusCode/100 != 2 { + err := fmt.Errorf("request to %s responded with %s", url, resp.Status) + if resp.StatusCode/100 == 5 { + // 5xx errors are retriable + return &retriableError{ + err: err, + endpoint: e, + } + } + + // All others aren't + return err + } + + // Everything went fine + return nil +} + +func (e *datadogEndpoint) String() string { + return fmt.Sprintf("DataDogEndpoint(%q)", e.host+e.path) +} + +// timeout is the HTTP timeout for POST requests to the Datadog backend +const timeout = 10 * time.Second + +// newClient returns a http.Client configured with the Agent options. +func newClient(conf *config.AgentConfig, ignoreProxy bool) *http.Client { + transport := &http.Transport{ + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + DualStack: true, + }).DialContext, + MaxIdleConns: 100, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + TLSClientConfig: &tls.Config{InsecureSkipVerify: conf.SkipSSLValidation}, + } + if conf.ProxyURL != nil && !ignoreProxy { + log.Infof("configuring proxy through: %s", conf.ProxyURL.String()) + transport.Proxy = http.ProxyURL(conf.ProxyURL) + } + return &http.Client{Timeout: timeout, Transport: transport} +} diff --git a/pkg/trace/writer/endpoint_test.go b/pkg/trace/writer/endpoint_test.go new file mode 100644 index 0000000000000..815b8be807a86 --- /dev/null +++ b/pkg/trace/writer/endpoint_test.go @@ -0,0 +1,146 @@ +package writer + +import ( + "net/http" + "net/url" + "testing" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/stretchr/testify/assert" +) + +func TestNewClient(t *testing.T) { + assert := assert.New(t) + url, err := url.Parse("test_url") + if err != nil { + t.Fatal(err) + } + + t.Run("blank", func(t *testing.T) { + client := newClient(&config.AgentConfig{}, false) + transport := client.Transport.(*http.Transport) + assert.False(transport.TLSClientConfig.InsecureSkipVerify) + assert.Nil(transport.Proxy) + }) + + t.Run("no_proxy", func(t *testing.T) { + client := newClient(&config.AgentConfig{ + SkipSSLValidation: true, + ProxyURL: url, + }, true) + transport := client.Transport.(*http.Transport) + assert.True(transport.TLSClientConfig.InsecureSkipVerify) + assert.Nil(transport.Proxy) + }) + + t.Run("proxy", func(t *testing.T) { + client := newClient(&config.AgentConfig{ProxyURL: url}, false) + transport := client.Transport.(*http.Transport) + goturl, _ := transport.Proxy(nil) + assert.False(transport.TLSClientConfig.InsecureSkipVerify) + assert.Equal("test_url", goturl.String()) + }) +} + +func TestNewEndpoints(t *testing.T) { + t.Run("disabled", func(t *testing.T) { + e := newEndpoints(&config.AgentConfig{Enabled: false}, "") + _, ok := e[0].(*nullEndpoint) + assert.True(t, ok) + }) + + t.Run("panic", func(t *testing.T) { + for 
name, tt := range map[string]struct { + cfg *config.AgentConfig + err string + }{ + "key": {&config.AgentConfig{Enabled: true}, "must have at least one endpoint with key"}, + "key2": {&config.AgentConfig{Enabled: true, Endpoints: []*config.Endpoint{{Host: "123"}}}, "must have at least one endpoint with key"}, + "endpoint": {&config.AgentConfig{Enabled: true, Endpoints: []*config.Endpoint{{APIKey: "123"}}}, "must have at least one endpoint with key"}, + } { + t.Run(name, func(t *testing.T) { + defer func() { + if e, ok := recover().(error); !ok || e == nil { + t.Fatal("expected panic") + } else { + if e.Error() != tt.err { + t.Fatalf("invalid error, got %q", e.Error()) + } + } + }() + newEndpoints(tt.cfg, "") + }) + } + }) + + t.Run("ok", func(t *testing.T) { + for name, tt := range map[string]struct { + cfg *config.AgentConfig + path string + exp []*datadogEndpoint + }{ + "main": { + cfg: &config.AgentConfig{Enabled: true, Endpoints: []*config.Endpoint{{Host: "host1", APIKey: "key1"}}}, + path: "/api/trace", + exp: []*datadogEndpoint{{host: "host1", apiKey: "key1", path: "/api/trace"}}, + }, + "additional": { + cfg: &config.AgentConfig{ + Enabled: true, + Endpoints: []*config.Endpoint{ + {Host: "host1", APIKey: "key1"}, + {Host: "host2", APIKey: "key2"}, + {Host: "host3", APIKey: "key3"}, + {Host: "host4", APIKey: "key4"}, + }, + }, + path: "/api/trace", + exp: []*datadogEndpoint{ + {host: "host1", apiKey: "key1", path: "/api/trace"}, + {host: "host2", apiKey: "key2", path: "/api/trace"}, + {host: "host3", apiKey: "key3", path: "/api/trace"}, + {host: "host4", apiKey: "key4", path: "/api/trace"}, + }, + }, + } { + t.Run(name, func(t *testing.T) { + assert := assert.New(t) + e := newEndpoints(tt.cfg, tt.path) + for i, want := range tt.exp { + got := e[i].(*datadogEndpoint) + assert.Equal(want.host, got.host) + assert.Equal(want.apiKey, got.apiKey) + assert.Equal(want.path, got.path) + } + }) + } + }) + + t.Run("proxy", func(t *testing.T) { + assert := assert.New(t) + proxyURL, err := url.Parse("test_url") + if err != nil { + t.Fatal(err) + } + e := newEndpoints(&config.AgentConfig{ + Enabled: true, + ProxyURL: proxyURL, + Endpoints: []*config.Endpoint{ + {Host: "host1", APIKey: "key1"}, + {Host: "host2", APIKey: "key2"}, + {Host: "host3", APIKey: "key3", NoProxy: true}, + }, + }, "/api/trace") + + // proxy ok + for _, i := range []int{0, 1} { + tr := e[i].(*datadogEndpoint).client.Transport.(*http.Transport) + p, _ := tr.Proxy(nil) + assert.Equal("test_url", p.String()) + } + + // proxy skipped + tr := e[2].(*datadogEndpoint).client.Transport.(*http.Transport) + assert.Nil(tr.Proxy) + }) +} diff --git a/pkg/trace/writer/fixtures_test.go b/pkg/trace/writer/fixtures_test.go new file mode 100644 index 0000000000000..94b82e84aa8c6 --- /dev/null +++ b/pkg/trace/writer/fixtures_test.go @@ -0,0 +1,218 @@ +package writer + +import ( + "math/rand" + "sync" + + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" +) + +// payloadConstructedHandlerArgs encodes the arguments passed to a PayloadConstructedHandler call. +type payloadConstructedHandlerArgs struct { + payload *payload + stats interface{} +} + +// testEndpoint represents a mocked endpoint that replies with a configurable error and records successful and failed +// payloads. 
+type testEndpoint struct { + sync.RWMutex + err error + successPayloads []*payload + errorPayloads []*payload +} + +func (e *testEndpoint) baseURL() string { return "" } + +// Write mocks the writing of a payload to a remote endpoint, recording it and replying with the configured error (or +// success in its absence). +func (e *testEndpoint) write(payload *payload) error { + e.Lock() + defer e.Unlock() + if e.err != nil { + e.errorPayloads = append(e.errorPayloads, payload) + } else { + e.successPayloads = append(e.successPayloads, payload) + } + return e.err +} + +func (e *testEndpoint) Error() error { + e.RLock() + defer e.RUnlock() + return e.err +} + +// ErrorPayloads returns all the error payloads registered with the test endpoint. +func (e *testEndpoint) ErrorPayloads() []*payload { + e.RLock() + defer e.RUnlock() + return e.errorPayloads +} + +// SuccessPayloads returns all the success payloads registered with the test endpoint. +func (e *testEndpoint) SuccessPayloads() []*payload { + e.RLock() + defer e.RUnlock() + return e.successPayloads +} + +// SetError sets the passed error on the endpoint. +func (e *testEndpoint) SetError(err error) { + e.Lock() + defer e.Unlock() + e.err = err +} + +func (e *testEndpoint) String() string { + return "testEndpoint" +} + +// RandomPayload creates a new payload instance using random data and up to 32 bytes. +func randomPayload() *payload { + return randomSizedPayload(rand.Intn(32)) +} + +// randomSizedPayload creates a new payload instance using random data with the specified size. +func randomSizedPayload(size int) *payload { + return newPayload(testutil.RandomSizedBytes(size), testutil.RandomStringMap()) +} + +// testPayloadSender is a PayloadSender that is connected to a testEndpoint, used for testing. +type testPayloadSender struct { + *queuableSender + testEndpoint *testEndpoint +} + +// newTestPayloadSender creates a new instance of a testPayloadSender. +func newTestPayloadSender() *testPayloadSender { + testEndpoint := &testEndpoint{} + return &testPayloadSender{ + testEndpoint: testEndpoint, + queuableSender: newDefaultSender(testEndpoint), + } +} + +// Start asynchronously starts this payload sender. +func (c *testPayloadSender) Start() { + go c.Run() +} + +// Run executes the core loop of this sender. +func (c *testPayloadSender) Run() { + defer close(c.exit) + + for { + select { + case payload := <-c.in: + stats, err := c.doSend(payload) + + if err != nil { + c.notifyError(payload, err, stats) + } else { + c.notifySuccess(payload, stats) + } + case <-c.exit: + return + } + } +} + +// Payloads allows access to all payloads recorded as being successfully sent by this sender. +func (c *testPayloadSender) Payloads() []*payload { + return c.testEndpoint.SuccessPayloads() +} + +// Endpoint allows access to the underlying testEndpoint. +func (c *testPayloadSender) Endpoint() *testEndpoint { + return c.testEndpoint +} + +func (c *testPayloadSender) setEndpoint(e endpoint) { + c.testEndpoint = e.(*testEndpoint) +} + +// testPayloadSenderMonitor monitors a PayloadSender and stores all events +type testPayloadSenderMonitor struct { + events []monitorEvent + sender payloadSender + exit chan struct{} +} + +// newTestPayloadSenderMonitor creates a new testPayloadSenderMonitor monitoring the specified sender. +func newTestPayloadSenderMonitor(sender payloadSender) *testPayloadSenderMonitor { + return &testPayloadSenderMonitor{ + sender: sender, + exit: make(chan struct{}), + } +} + +// Start asynchronously starts this payload monitor. 
+func (m *testPayloadSenderMonitor) Start() { + go m.Run() +} + +// Run executes the core loop of this monitor. +func (m *testPayloadSenderMonitor) Run() { + defer close(m.exit) + + for { + select { + case event, ok := <-m.sender.Monitor(): + if !ok { + continue // wait for exit + } + m.events = append(m.events, event) + case <-m.exit: + return + } + } +} + +// Stop stops this payload monitor and waits for it to stop. +func (m *testPayloadSenderMonitor) Stop() { + m.exit <- struct{}{} + <-m.exit +} + +// SuccessPayloads returns a slice containing all successful payloads. +func (m *testPayloadSenderMonitor) SuccessPayloads() []*payload { + return m.eventPayloads(eventTypeSuccess) +} + +// FailurePayloads returns a slice containing all failed payloads. +func (m *testPayloadSenderMonitor) FailurePayloads() []*payload { + return m.eventPayloads(eventTypeFailure) +} + +// FailureEvents returns all failure events. +func (m *testPayloadSenderMonitor) FailureEvents() []monitorEvent { + return m.eventsByType(eventTypeFailure) +} + +// RetryPayloads returns a slice containing all failed payloads. +func (m *testPayloadSenderMonitor) RetryPayloads() []*payload { + return m.eventPayloads(eventTypeRetry) +} + +func (m *testPayloadSenderMonitor) eventPayloads(t eventType) []*payload { + res := make([]*payload, 0) + for _, e := range m.events { + if e.typ != t { + continue + } + res = append(res, e.payload) + } + return res +} + +func (m *testPayloadSenderMonitor) eventsByType(t eventType) []monitorEvent { + res := make([]monitorEvent, 0) + for _, e := range m.events { + if e.typ != t { + continue + } + res = append(res, e) + } + return res +} diff --git a/pkg/trace/writer/multi.go b/pkg/trace/writer/multi.go new file mode 100644 index 0000000000000..c663300ac50ab --- /dev/null +++ b/pkg/trace/writer/multi.go @@ -0,0 +1,77 @@ +package writer + +import ( + "sync" + + "github.com/DataDog/datadog-agent/pkg/trace/writer/config" +) + +var _ payloadSender = (*multiSender)(nil) + +// multiSender is an implementation of payloadSender which forwards any +// received payload to multiple payloadSenders, funnelling incoming monitor +// events. +type multiSender struct { + senders []payloadSender + mwg sync.WaitGroup // monitor funnel waitgroup + mch chan monitorEvent // monitor funneling channel +} + +// newMultiSender returns a new payloadSender which forwards all sent payloads to all +// the given endpoints, as well as funnels all monitoring channels. +func newMultiSender(endpoints []endpoint, cfg config.QueuablePayloadSenderConf) payloadSender { + if len(endpoints) == 1 { + return newSender(endpoints[0], cfg) + } + senders := make([]payloadSender, len(endpoints)) + for i, e := range endpoints { + senders[i] = newSender(e, cfg) + } + return &multiSender{ + senders: senders, + mch: make(chan monitorEvent, len(senders)), + } +} + +// Start starts all senders. +func (w *multiSender) Start() { + for _, sender := range w.senders { + sender.Start() + } + for _, sender := range w.senders { + w.mwg.Add(1) + go func(ch <-chan monitorEvent) { + defer w.mwg.Done() + for event := range ch { + w.mch <- event + } + }(sender.Monitor()) + } +} + +// Stop stops all senders. +func (w *multiSender) Stop() { + for _, sender := range w.senders { + sender.Stop() + } + w.mwg.Wait() + close(w.mch) +} + +// Send forwards the payload to all registered senders. 
+func (w *multiSender) Send(p *payload) { + for _, sender := range w.senders { + sender.Send(p) + } +} + +func (w *multiSender) Monitor() <-chan monitorEvent { return w.mch } + +// Run implements payloadSender. +func (w *multiSender) Run() { /* no-op */ } + +func (w *multiSender) setEndpoint(endpoint endpoint) { + for _, sender := range w.senders { + sender.setEndpoint(endpoint) + } +} diff --git a/pkg/trace/writer/multi_test.go b/pkg/trace/writer/multi_test.go new file mode 100644 index 0000000000000..1243eb43c5c58 --- /dev/null +++ b/pkg/trace/writer/multi_test.go @@ -0,0 +1,203 @@ +package writer + +import ( + "os" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + "github.com/stretchr/testify/assert" +) + +func TestMain(m *testing.M) { + testStatsClient := &testutil.TestStatsClient{} + originalClient := metrics.Client + metrics.Client = testStatsClient + defer func() { + metrics.Client = originalClient + }() + os.Exit(m.Run()) +} + +func TestNewMultiSenderFactory(t *testing.T) { + cfg := config.DefaultQueuablePayloadSenderConf() + + t.Run("one", func(t *testing.T) { + e := &datadogEndpoint{host: "host1", apiKey: "key1"} + sender, ok := newMultiSender([]endpoint{e}, cfg).(*queuableSender) + assert := assert.New(t) + assert.True(ok) + assert.EqualValues(e, sender.endpoint) + assert.EqualValues(cfg, sender.conf) + }) + + t.Run("multi", func(t *testing.T) { + endpoints := []endpoint{ + &datadogEndpoint{host: "host1", apiKey: "key1"}, + &datadogEndpoint{host: "host2", apiKey: "key2"}, + &datadogEndpoint{host: "host3", apiKey: "key3"}, + } + sender, ok := newMultiSender(endpoints, cfg).(*multiSender) + assert := assert.New(t) + assert.True(ok) + assert.Len(sender.senders, 3) + assert.Equal(3, cap(sender.mch)) + for i := range endpoints { + s, ok := sender.senders[i].(*queuableSender) + assert.True(ok) + assert.EqualValues(endpoints[i], s.endpoint) + assert.EqualValues(cfg, s.conf) + } + }) +} + +func TestMultiSender(t *testing.T) { + t.Run("Start", func(t *testing.T) { + mock1 := newMockSender() + mock2 := newMockSender() + multi := &multiSender{senders: []payloadSender{mock1, mock2}, mch: make(chan monitorEvent)} + multi.Start() + defer multi.Stop() + + assert := assert.New(t) + assert.Equal(1, mock1.StartCalls()) + assert.Equal(1, mock2.StartCalls()) + }) + + t.Run("Stop", func(t *testing.T) { + mock1 := newMockSender() + mock2 := newMockSender() + multi := &multiSender{senders: []payloadSender{mock1, mock2}, mch: make(chan monitorEvent)} + multi.Stop() + + assert := assert.New(t) + assert.Equal(1, mock1.StopCalls()) + assert.Equal(1, mock2.StopCalls()) + + select { + case <-multi.mch: + default: + t.Fatal("monitor channel should be closed") + } + }) + + t.Run("Send", func(t *testing.T) { + mock1 := newMockSender() + mock2 := newMockSender() + p := &payload{creationDate: time.Now(), bytes: []byte{1, 2, 3}} + multi := &multiSender{senders: []payloadSender{mock1, mock2}, mch: make(chan monitorEvent)} + multi.Send(p) + + assert := assert.New(t) + assert.Equal(p, mock1.SendCalls()[0]) + assert.Equal(p, mock2.SendCalls()[0]) + }) + + t.Run("funnel", func(t *testing.T) { + mock1 := newMockSender() + mock2 := newMockSender() + multi := &multiSender{senders: []payloadSender{mock1, mock2}, mch: make(chan monitorEvent)} + multi.Start() + defer multi.Stop() + + event1 := monitorEvent{typ: eventTypeSuccess, stats: 
sendStats{host: "ABC"}} + event2 := monitorEvent{typ: eventTypeFailure, stats: sendStats{host: "QWE"}} + + mock1.monitorCh <- event1 + mock2.monitorCh <- event2 + + assert.ElementsMatch(t, + []monitorEvent{event1, event2}, + []monitorEvent{<-multi.mch, <-multi.mch}, + ) + }) +} + +func TestMockPayloadSender(t *testing.T) { + p := &payload{creationDate: time.Now(), bytes: []byte{1, 2, 3}} + mock := newMockSender() + mock.Start() + mock.Start() + mock.Start() + mock.Send(p) + mock.Send(p) + mock.Stop() + + assert := assert.New(t) + assert.Equal(3, mock.StartCalls()) + assert.Equal(p, mock.SendCalls()[0]) + assert.Equal(p, mock.SendCalls()[1]) + assert.Equal(1, mock.StopCalls()) + + mock.Reset() + assert.Equal(0, mock.StartCalls()) + assert.Equal(0, mock.StopCalls()) + assert.Len(mock.SendCalls(), 0) +} + +var _ payloadSender = (*mockPayloadSender)(nil) + +type mockPayloadSender struct { + startCalls uint64 + stopCalls uint64 + + mu sync.Mutex + sendCalls []*payload + monitorCh chan monitorEvent +} + +func newMockSender() *mockPayloadSender { + return &mockPayloadSender{monitorCh: make(chan monitorEvent)} +} + +func (m *mockPayloadSender) Reset() { + atomic.SwapUint64(&m.startCalls, 0) + atomic.SwapUint64(&m.stopCalls, 0) + m.mu.Lock() + m.sendCalls = m.sendCalls[:0] + m.monitorCh = make(chan monitorEvent) + m.mu.Unlock() +} + +func (m *mockPayloadSender) Start() { + atomic.AddUint64(&m.startCalls, 1) +} + +func (m *mockPayloadSender) StartCalls() int { + return int(atomic.LoadUint64(&m.startCalls)) +} + +// Stop must be called only once. It closes the monitor channel. +func (m *mockPayloadSender) Stop() { + atomic.AddUint64(&m.stopCalls, 1) + close(m.monitorCh) +} + +func (m *mockPayloadSender) StopCalls() int { + return int(atomic.LoadUint64(&m.stopCalls)) +} + +func (m *mockPayloadSender) Send(p *payload) { + m.mu.Lock() + m.sendCalls = append(m.sendCalls, p) + m.mu.Unlock() +} + +func (m *mockPayloadSender) SendCalls() []*payload { + m.mu.Lock() + defer m.mu.Unlock() + return m.sendCalls +} + +func (m *mockPayloadSender) Monitor() <-chan monitorEvent { + m.mu.Lock() + defer m.mu.Unlock() + return m.monitorCh +} + +func (m *mockPayloadSender) Run() {} +func (m *mockPayloadSender) setEndpoint(_ endpoint) {} diff --git a/pkg/trace/writer/payload.go b/pkg/trace/writer/payload.go new file mode 100644 index 0000000000000..e85f6e6e75eba --- /dev/null +++ b/pkg/trace/writer/payload.go @@ -0,0 +1,380 @@ +package writer + +import ( + "container/list" + "fmt" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" + "github.com/DataDog/datadog-agent/pkg/trace/writer/backoff" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + log "github.com/cihub/seelog" +) + +// payload represents a data payload to be sent to some endpoint +type payload struct { + creationDate time.Time + bytes []byte + headers map[string]string +} + +// newPayload constructs a new payload object with the provided data and with CreationDate initialized to the current +// time. +func newPayload(bytes []byte, headers map[string]string) *payload { + return &payload{ + creationDate: time.Now(), + bytes: bytes, + headers: headers, + } +} + +// eventType is a type of event sent down the monitor channel. 
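+// Writers consume these events by ranging over the sender's Monitor() channel and
+// switching on the event type, e.g. (sketch, mirroring what ServiceWriter.Run does):
+//
+//	for event := range sender.Monitor() {
+//		switch event.typ {
+//		case eventTypeSuccess:
+//			// update flush stats
+//		case eventTypeRetry:
+//			// count the retry; event.retryDelay holds the backoff delay
+//		case eventTypeFailure:
+//			// log event.err and drop the payload
+//		}
+//	}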
+type eventType int + +const ( + eventTypeSuccess eventType = iota + eventTypeFailure + eventTypeRetry +) + +var eventTypeStrings = map[eventType]string{ + eventTypeSuccess: "success", + eventTypeFailure: "failure", + eventTypeRetry: "retry", +} + +func (e eventType) String() string { return eventTypeStrings[e] } + +type monitorEvent struct { + typ eventType + payload *payload + stats sendStats + err error + retryDelay time.Duration + retryNum int +} + +// sendStats represents basic stats related to the sending of a payload. +type sendStats struct { + sendTime time.Duration + host string +} + +// payloadSender represents an object capable of asynchronously sending payloads to some endpoint. +type payloadSender interface { + Start() + Run() + Stop() + Send(payload *payload) + Monitor() <-chan monitorEvent + + setEndpoint(endpoint) +} + +// queuableSender is a specific implementation of a payloadSender that will queue new payloads on error and +// retry sending them according to some configurable BackoffTimer. +type queuableSender struct { + conf writerconfig.QueuablePayloadSenderConf + queuedPayloads *list.List + queuing bool + currentQueuedSize int64 + + backoffTimer backoff.Timer + + // Test helper + syncBarrier <-chan interface{} + + in chan *payload + monitorCh chan monitorEvent + endpoint endpoint + + exit chan struct{} +} + +// newDefaultSender constructs a new queuableSender with default configuration to send payloads to the +// provided endpoint. +func newDefaultSender(e endpoint) *queuableSender { + return newSender(e, writerconfig.DefaultQueuablePayloadSenderConf()) +} + +// newSender constructs a new QueuablePayloadSender with custom configuration to send payloads to +// the provided endpoint. +func newSender(e endpoint, conf writerconfig.QueuablePayloadSenderConf) *queuableSender { + return &queuableSender{ + conf: conf, + queuedPayloads: list.New(), + backoffTimer: backoff.NewCustomExponentialTimer(conf.ExponentialBackoff), + in: make(chan *payload), + monitorCh: make(chan monitorEvent), + endpoint: e, + exit: make(chan struct{}), + } +} + +// Send sends a single isolated payload through this sender. +func (s *queuableSender) Send(payload *payload) { + s.in <- payload +} + +// Stop asks this sender to stop and waits until it correctly stops. +func (s *queuableSender) Stop() { + s.exit <- struct{}{} + <-s.exit + close(s.in) + close(s.monitorCh) +} + +func (s *queuableSender) setEndpoint(e endpoint) { + s.endpoint = e +} + +// Monitor allows an external entity to monitor events of this sender by receiving Sender*Event structs. +func (s *queuableSender) Monitor() <-chan monitorEvent { + return s.monitorCh +} + +// send will send the provided payload without any checks. +func (s *queuableSender) doSend(payload *payload) (sendStats, error) { + if payload == nil { + return sendStats{}, nil + } + + startFlush := time.Now() + err := s.endpoint.write(payload) + + sendStats := sendStats{ + sendTime: time.Since(startFlush), + host: s.endpoint.baseURL(), + } + + return sendStats, err +} + +// Start asynchronously starts this QueueablePayloadSender. +func (s *queuableSender) Start() { + go func() { + defer watchdog.LogOnPanic() + s.Run() + }() +} + +// Run executes the queuableSender main logic synchronously. +func (s *queuableSender) Run() { + defer close(s.exit) + + for { + select { + case payload := <-s.in: + if stats, err := s.sendOrQueue(payload); err != nil { + log.Debugf("Error while sending or queueing payload. 
err=%v", err) + s.notifyError(payload, err, stats) + } + case <-s.backoffTimer.ReceiveTick(): + s.flushQueue() + case <-s.syncBarrier: + // TODO: Is there a way of avoiding this? I want Promises in Go :((( + // This serves as a barrier (assuming syncBarrier is an unbuffered channel). Used for testing + continue + case <-s.exit: + log.Info("exiting payload sender, try flushing whatever is left") + s.flushQueue() + return + } + } +} + +// NumQueuedPayloads returns the number of payloads currently waiting in the queue for a retry +func (s *queuableSender) NumQueuedPayloads() int { + return s.queuedPayloads.Len() +} + +// sendOrQueue sends the provided payload or queues it if this sender is currently queueing payloads. +func (s *queuableSender) sendOrQueue(payload *payload) (sendStats, error) { + var stats sendStats + + if payload == nil { + return stats, nil + } + + var err error + + if !s.queuing { + if stats, err = s.doSend(payload); err != nil { + if _, ok := err.(*retriableError); ok { + // If error is retriable, start a queue and schedule a retry + retryNum, delay := s.backoffTimer.ScheduleRetry(err) + log.Debugf("Got retriable error. Starting a queue. delay=%s, err=%v", delay, err) + s.notifyRetry(payload, err, delay, retryNum) + return stats, s.enqueue(payload) + } + } else { + // If success, notify + log.Tracef("Successfully sent direct payload: %v", payload) + s.notifySuccess(payload, stats) + } + } else { + return stats, s.enqueue(payload) + } + + return stats, err +} + +func (s *queuableSender) enqueue(payload *payload) error { + if !s.queuing { + s.queuing = true + } + + // Start by discarding payloads that are too old, freeing up memory + s.discardOldPayloads() + + for s.conf.MaxQueuedPayloads > 0 && s.queuedPayloads.Len() >= s.conf.MaxQueuedPayloads { + log.Debugf("Dropping existing payload because max queued payloads reached: %d", s.conf.MaxQueuedPayloads) + if _, err := s.dropOldestPayload("max queued payloads reached"); err != nil { + panic(fmt.Errorf("unable to respect max queued payloads value of %d", s.conf.MaxQueuedPayloads)) + } + } + + newPayloadSize := int64(len(payload.bytes)) + + if s.conf.MaxQueuedBytes > 0 && newPayloadSize > s.conf.MaxQueuedBytes { + log.Debugf("Payload bigger than max size: size=%d, max size=%d", newPayloadSize, s.conf.MaxQueuedBytes) + return fmt.Errorf("unable to queue payload bigger than max size: payload size=%d, max size=%d", + newPayloadSize, s.conf.MaxQueuedBytes) + } + + for s.conf.MaxQueuedBytes > 0 && s.currentQueuedSize+newPayloadSize > s.conf.MaxQueuedBytes { + if _, err := s.dropOldestPayload("max queued bytes reached"); err != nil { + // Should never happen because we know we can fit it in + panic(fmt.Errorf("unable to find space for queueing payload of size %d: %v", newPayloadSize, err)) + } + } + + log.Tracef("Queuing new payload: %v", payload) + s.queuedPayloads.PushBack(payload) + s.currentQueuedSize += newPayloadSize + + return nil +} + +func (s *queuableSender) flushQueue() error { + log.Debugf("Attempting to flush queue with %d payloads", s.NumQueuedPayloads()) + + // Start by discarding payloads that are too old + s.discardOldPayloads() + + // For the remaining ones, try to send them one by one + var next *list.Element + for e := s.queuedPayloads.Front(); e != nil; e = next { + payload := e.Value.(*payload) + + var err error + var stats sendStats + + if stats, err = s.doSend(payload); err != nil { + if _, ok := err.(*retriableError); ok { + // If send failed due to a retriable error, retry flush later + retryNum, delay := 
s.backoffTimer.ScheduleRetry(err) + log.Debugf("Got retriable error. Retrying flush later: retry=%d, delay=%s, err=%v", + retryNum, delay, err) + s.notifyRetry(payload, err, delay, retryNum) + // Don't try to send following. We'll flush all later. + return err + } + + // If send failed due to non-retriable error, notify error and drop it + log.Debugf("Dropping payload due to non-retriable error: err=%v, payload=%v", err, payload) + s.notifyError(payload, err, stats) + next = s.removeQueuedPayload(e) + // Try sending next ones + continue + } + + // If successful, remove payload from queue + log.Tracef("Successfully sent a queued payload: %v", payload) + s.notifySuccess(payload, stats) + next = s.removeQueuedPayload(e) + } + + s.queuing = false + s.backoffTimer.Reset() + + return nil +} + +func (s *queuableSender) removeQueuedPayload(e *list.Element) *list.Element { + next := e.Next() + payload := e.Value.(*payload) + s.currentQueuedSize -= int64(len(payload.bytes)) + s.queuedPayloads.Remove(e) + return next +} + +// Discard those payloads that are older than max age. +func (s *queuableSender) discardOldPayloads() { + // If MaxAge <= 0 then age limitation is disabled so do nothing + if s.conf.MaxAge <= 0 { + return + } + + var next *list.Element + + for e := s.queuedPayloads.Front(); e != nil; e = next { + payload := e.Value.(*payload) + + age := time.Since(payload.creationDate) + + // Payloads are kept in order so as soon as we find one that isn't, we can break out + if age < s.conf.MaxAge { + break + } + + err := fmt.Errorf("payload is older than max age: age=%v, max age=%v", age, s.conf.MaxAge) + log.Tracef("Discarding payload: err=%v, payload=%v", err, payload) + s.notifyError(payload, err, sendStats{}) + next = s.removeQueuedPayload(e) + } +} + +// Payloads are kept in order so dropping the one at the front guarantees we're dropping the oldest +func (s *queuableSender) dropOldestPayload(reason string) (*payload, error) { + if s.queuedPayloads.Len() == 0 { + return nil, fmt.Errorf("no queued payloads") + } + + err := fmt.Errorf("payload dropped: %s", reason) + droppedPayload := s.queuedPayloads.Front().Value.(*payload) + s.removeQueuedPayload(s.queuedPayloads.Front()) + s.notifyError(droppedPayload, err, sendStats{}) + + return droppedPayload, nil +} + +func (s *queuableSender) notifySuccess(payload *payload, sendStats sendStats) { + s.sendEvent(&monitorEvent{ + typ: eventTypeSuccess, + payload: payload, + stats: sendStats, + }) +} + +func (s *queuableSender) notifyError(payload *payload, err error, sendStats sendStats) { + s.sendEvent(&monitorEvent{ + typ: eventTypeFailure, + payload: payload, + err: err, + }) +} + +func (s *queuableSender) notifyRetry(payload *payload, err error, delay time.Duration, retryNum int) { + s.sendEvent(&monitorEvent{ + typ: eventTypeRetry, + payload: payload, + err: err, + retryDelay: delay, + retryNum: retryNum, + }) +} + +func (s *queuableSender) sendEvent(event *monitorEvent) { + s.monitorCh <- *event +} diff --git a/pkg/trace/writer/payload_test.go b/pkg/trace/writer/payload_test.go new file mode 100644 index 0000000000000..e03eef07e29b7 --- /dev/null +++ b/pkg/trace/writer/payload_test.go @@ -0,0 +1,540 @@ +package writer + +import ( + "fmt" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + "github.com/DataDog/datadog-agent/pkg/trace/writer/backoff" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + "github.com/stretchr/testify/assert" +) + +func TestNewPayloadSetsCreationDate(t *testing.T) { 
+ assert := assert.New(t) + + p := newPayload(nil, nil) + + assert.WithinDuration(time.Now(), p.creationDate, 1*time.Second) +} + +func TestQueuablePayloadSender_WorkingEndpoint(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that doesn't fail + workingEndpoint := &testEndpoint{} + + // And a queuable sender using that endpoint + queuableSender := newDefaultSender(workingEndpoint) + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + // When we start the sender + monitor.Start() + queuableSender.Start() + + // And send some payloads + payload1 := randomPayload() + queuableSender.Send(payload1) + payload2 := randomPayload() + queuableSender.Send(payload2) + payload3 := randomPayload() + queuableSender.Send(payload3) + payload4 := randomPayload() + queuableSender.Send(payload4) + payload5 := randomPayload() + queuableSender.Send(payload5) + + // And stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then we expect all sent payloads to have been successfully sent + successPayloads := monitor.SuccessPayloads() + errorPayloads := monitor.FailurePayloads() + assert.Equal([]*payload{payload1, payload2, payload3, payload4, payload5}, successPayloads, + "Expect all sent payloads to have been successful") + assert.Equal(successPayloads, workingEndpoint.SuccessPayloads(), "Expect sender and endpoint to match on successful payloads") + assert.Len(errorPayloads, 0, "No payloads should have errored out on send") + assert.Len(workingEndpoint.ErrorPayloads(), 0, "No payloads should have errored out on send") +} + +func TestQueuablePayloadSender_FlakyEndpoint(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that initially works ok + flakyEndpoint := &testEndpoint{} + + // And a test backoff timer that can be triggered on-demand + testBackoffTimer := testutil.NewTestBackoffTimer() + + // And a queuable sender using said endpoint and timer + conf := writerconfig.DefaultQueuablePayloadSenderConf() + queuableSender := newSender(flakyEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // With a working endpoint + // We send some payloads + payload1 := randomPayload() + queuableSender.Send(payload1) + payload2 := randomPayload() + queuableSender.Send(payload2) + + // Make sure sender processed both payloads + syncBarrier <- nil + + assert.Equal(0, queuableSender.NumQueuedPayloads(), "Expect no queued payloads") + + // With a failing endpoint with a retriable error + flakyEndpoint.SetError(&retriableError{err: fmt.Errorf("bleh"), endpoint: flakyEndpoint}) + // We send some payloads + payload3 := randomPayload() + queuableSender.Send(payload3) + payload4 := randomPayload() + queuableSender.Send(payload4) + // And retry once + testBackoffTimer.TriggerTick() + // And retry twice + testBackoffTimer.TriggerTick() + + // Make sure sender processed both ticks + syncBarrier <- nil + + assert.Equal(2, queuableSender.NumQueuedPayloads(), "Expect 2 queued payloads") + + // With the previously failing endpoint working again + flakyEndpoint.SetError(nil) + // We retry for the third time + testBackoffTimer.TriggerTick() + + // Make sure sender processed previous tick + syncBarrier <- nil + + assert.Equal(0, queuableSender.NumQueuedPayloads(), "Expect no queued payloads") + + // Finally, 
with a failing endpoint with a non-retriable error + flakyEndpoint.SetError(fmt.Errorf("non retriable bleh")) + // We send some payloads + payload5 := randomPayload() + queuableSender.Send(payload5) + payload6 := randomPayload() + queuableSender.Send(payload6) + + // Make sure sender processed previous payloads + syncBarrier <- nil + + assert.Equal(0, queuableSender.NumQueuedPayloads(), "Expect no queued payloads") + + // With the previously failing endpoint working again + flakyEndpoint.SetError(nil) + // We retry just in case there's something in the queue + testBackoffTimer.TriggerTick() + + // And stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then we expect payloads sent during working endpoint or those that were retried due to retriable errors to have + // been sent eventually (and in order). Those that failed because of non-retriable errors should have been discarded + // even after a retry. + successPayloads := monitor.SuccessPayloads() + errorPayloads := monitor.FailurePayloads() + retryPayloads := monitor.RetryPayloads() + assert.Equal([]*payload{payload1, payload2, payload3, payload4}, successPayloads, + "Expect all sent payloads to have been successful") + assert.Equal(successPayloads, flakyEndpoint.SuccessPayloads(), "Expect sender and endpoint to match on successful payloads") + // Expect 3 retry events for payload 3 (one because of first send, two others because of the two retries) + assert.Equal([]*payload{payload3, payload3, payload3}, retryPayloads, "Expect payload 3 to have been retries 3 times") + // We expect payloads 5 and 6 to appear in error payloads as they failed for non-retriable errors. + assert.Equal([]*payload{payload5, payload6}, errorPayloads, "Expect errored payloads to have been discarded as expected") +} + +func TestQueuablePayloadSender_MaxQueuedPayloads(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that continuously throws out retriable errors + flakyEndpoint := &testEndpoint{} + flakyEndpoint.SetError(&retriableError{err: fmt.Errorf("bleh"), endpoint: flakyEndpoint}) + + // And a test backoff timer that can be triggered on-demand + testBackoffTimer := testutil.NewTestBackoffTimer() + + // And a queuable sender using said endpoint and timer and with a meager max queued payloads value of 1 + conf := writerconfig.DefaultQueuablePayloadSenderConf() + conf.MaxQueuedPayloads = 1 + queuableSender := newSender(flakyEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // When sending a first payload + payload1 := randomPayload() + queuableSender.Send(payload1) + + // Followed by another one + payload2 := randomPayload() + queuableSender.Send(payload2) + + // Followed by a third + payload3 := randomPayload() + queuableSender.Send(payload3) + + // Ensure previous payloads were processed + syncBarrier <- nil + + // Then, when the endpoint finally works + flakyEndpoint.SetError(nil) + + // And we trigger a retry + testBackoffTimer.TriggerTick() + + // Ensure tick was processed + syncBarrier <- nil + + // Then we should have no queued payloads + assert.Equal(0, queuableSender.NumQueuedPayloads(), "We should have no queued payloads") + + // When we stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then endpoint should have received only payload3. 
Other should have been discarded because max queued payloads + // is 1 + assert.Equal([]*payload{payload3}, flakyEndpoint.SuccessPayloads(), "Endpoint should have received only payload 3") + + // Monitor should agree on previous fact + assert.Equal([]*payload{payload3}, monitor.SuccessPayloads(), + "Monitor should agree with endpoint on succesful payloads") + assert.Equal([]*payload{payload1, payload2}, monitor.FailurePayloads(), + "Monitor should agree with endpoint on failed payloads") + assert.Contains(monitor.FailureEvents()[0].err.Error(), "max queued payloads", + "Monitor failure event should mention correct reason for error") + assert.Contains(monitor.FailureEvents()[1].err.Error(), "max queued payloads", + "Monitor failure event should mention correct reason for error") +} + +func TestQueuablePayloadSender_MaxQueuedBytes(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that continuously throws out retriable errors + flakyEndpoint := &testEndpoint{} + flakyEndpoint.SetError(&retriableError{err: fmt.Errorf("bleh"), endpoint: flakyEndpoint}) + + // And a test backoff timer that can be triggered on-demand + testBackoffTimer := testutil.NewTestBackoffTimer() + + // And a queuable sender using said endpoint and timer and with a meager max size of 10 bytes + conf := writerconfig.DefaultQueuablePayloadSenderConf() + conf.MaxQueuedBytes = 10 + queuableSender := newSender(flakyEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // When sending a first payload of 4 bytes + payload1 := randomSizedPayload(4) + queuableSender.Send(payload1) + + // Followed by another one of 2 bytes + payload2 := randomSizedPayload(2) + queuableSender.Send(payload2) + + // Followed by a third of 8 bytes + payload3 := randomSizedPayload(8) + queuableSender.Send(payload3) + + // Ensure previous payloads were processed + syncBarrier <- nil + + // Then, when the endpoint finally works + flakyEndpoint.SetError(nil) + + // And we trigger a retry + testBackoffTimer.TriggerTick() + + // Ensure tick was processed + syncBarrier <- nil + + // Then we should have no queued payloads + assert.Equal(0, queuableSender.NumQueuedPayloads(), "We should have no queued payloads") + + // When we stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then endpoint should have received payload2 and payload3. 
Payload1 should have been discarded because keeping all + // 3 would have put us over the max size of sender + assert.Equal([]*payload{payload2, payload3}, flakyEndpoint.SuccessPayloads(), + "Endpoint should have received only payload 2 and 3 (in that order)") + + // Monitor should agree on previous fact + assert.Equal([]*payload{payload2, payload3}, monitor.SuccessPayloads(), + "Monitor should agree with endpoint on succesful payloads") + assert.Equal([]*payload{payload1}, monitor.FailurePayloads(), + "Monitor should agree with endpoint on failed payloads") + assert.Contains(monitor.FailureEvents()[0].err.Error(), "max queued bytes", + "Monitor failure event should mention correct reason for error") +} + +func TestQueuablePayloadSender_DropBigPayloadsOnRetry(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that continuously throws out retriable errors + flakyEndpoint := &testEndpoint{} + flakyEndpoint.SetError(&retriableError{err: fmt.Errorf("bleh"), endpoint: flakyEndpoint}) + + // And a test backoff timer that can be triggered on-demand + testBackoffTimer := testutil.NewTestBackoffTimer() + + // And a queuable sender using said endpoint and timer and with a meager max size of 10 bytes + conf := writerconfig.DefaultQueuablePayloadSenderConf() + conf.MaxQueuedBytes = 10 + queuableSender := newSender(flakyEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // When sending a payload of 12 bytes + payload1 := randomSizedPayload(12) + queuableSender.Send(payload1) + + // Ensure previous payloads were processed + syncBarrier <- nil + + // Then, when the endpoint finally works + flakyEndpoint.SetError(nil) + + // And we trigger a retry + testBackoffTimer.TriggerTick() + + // Ensure tick was processed + syncBarrier <- nil + + // Then we should have no queued payloads + assert.Equal(0, queuableSender.NumQueuedPayloads(), "We should have no queued payloads") + + // When we stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then endpoint should have received no payloads because payload1 was too big to store in queue. 
+ assert.Len(flakyEndpoint.SuccessPayloads(), 0, "Endpoint should have received no payloads") + + // And monitor should have received failed event for payload1 with correct reason + assert.Equal([]*payload{payload1}, monitor.FailurePayloads(), + "Monitor should agree with endpoint on failed payloads") + assert.Contains(monitor.FailureEvents()[0].err.Error(), "bigger than max size", + "Monitor failure event should mention correct reason for error") +} + +func TestQueuablePayloadSender_SendBigPayloadsIfNoRetry(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that works + workingEndpoint := &testEndpoint{} + + // And a test backoff timer that can be triggered on-demand + testBackoffTimer := testutil.NewTestBackoffTimer() + + // And a queuable sender using said endpoint and timer and with a meager max size of 10 bytes + conf := writerconfig.DefaultQueuablePayloadSenderConf() + conf.MaxQueuedBytes = 10 + queuableSender := newSender(workingEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // When sending a payload of 12 bytes + payload1 := randomSizedPayload(12) + queuableSender.Send(payload1) + + // Ensure previous payloads were processed + syncBarrier <- nil + + // Then we should have no queued payloads + assert.Equal(0, queuableSender.NumQueuedPayloads(), "We should have no queued payloads") + + // When we stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then endpoint should have received payload1 because although it was big, it didn't get queued. + assert.Equal([]*payload{payload1}, workingEndpoint.SuccessPayloads(), "Endpoint should have received payload1") + + // And monitor should have received success event for payload1 + assert.Equal([]*payload{payload1}, monitor.SuccessPayloads(), + "Monitor should agree with endpoint on success payloads") +} + +func TestQueuablePayloadSender_MaxAge(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that continuously throws out retriable errors + flakyEndpoint := &testEndpoint{} + flakyEndpoint.SetError(&retriableError{err: fmt.Errorf("bleh"), endpoint: flakyEndpoint}) + + // And a test backoff timer that can be triggered on-demand + testBackoffTimer := testutil.NewTestBackoffTimer() + + // And a queuable sender using said endpoint and timer and with a meager max age of 100ms + conf := writerconfig.DefaultQueuablePayloadSenderConf() + conf.MaxAge = 100 * time.Millisecond + queuableSender := newSender(flakyEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // When sending two payloads one after the other + payload1 := randomPayload() + queuableSender.Send(payload1) + payload2 := randomPayload() + queuableSender.Send(payload2) + + // And then sleeping for 500ms + time.Sleep(500 * time.Millisecond) + + // And then sending a third payload + payload3 := randomPayload() + queuableSender.Send(payload3) + + // And then triggering a retry + testBackoffTimer.TriggerTick() + + // Ensure tick was processed + syncBarrier <- nil + + // Then, when the endpoint finally works + flakyEndpoint.SetError(nil) + + // And we trigger a retry + 
testBackoffTimer.TriggerTick() + + // Ensure tick was processed + syncBarrier <- nil + + // Then we should have no queued payloads + assert.Equal(0, queuableSender.NumQueuedPayloads(), "We should have no queued payloads") + + // When we stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then endpoint should have received only payload3. Because payload1 and payload2 were too old after the failed + // retry (first TriggerTick). + assert.Equal([]*payload{payload3}, flakyEndpoint.SuccessPayloads(), "Endpoint should have received only payload 3") + + // And monitor should have received failed events for payload1 and payload2 with correct reason + assert.Equal([]*payload{payload1, payload2}, monitor.FailurePayloads(), + "Monitor should agree with endpoint on failed payloads") + assert.Contains(monitor.FailureEvents()[0].err.Error(), "older than max age", + "Monitor failure event should mention correct reason for error") +} + +func TestQueuablePayloadSender_RetryOfTooOldQueue(t *testing.T) { + assert := assert.New(t) + + // Given an endpoint that continuously throws out retriable errors + flakyEndpoint := &testEndpoint{} + flakyEndpoint.SetError(&retriableError{err: fmt.Errorf("bleh"), endpoint: flakyEndpoint}) + + // And a backoff timer that triggers every 100ms + testBackoffTimer := backoff.NewCustomTimer(func(numRetries int, err error) time.Duration { + return 100 * time.Millisecond + }) + + // And a queuable sender using said endpoint and timer and with a meager max age of 200ms + conf := writerconfig.DefaultQueuablePayloadSenderConf() + conf.MaxAge = 200 * time.Millisecond + queuableSender := newSender(flakyEndpoint, conf) + queuableSender.backoffTimer = testBackoffTimer + syncBarrier := make(chan interface{}) + queuableSender.syncBarrier = syncBarrier + + // And a test monitor for that sender + monitor := newTestPayloadSenderMonitor(queuableSender) + + monitor.Start() + queuableSender.Start() + + // When sending two payloads one after the other + payload1 := randomPayload() + queuableSender.Send(payload1) + payload2 := randomPayload() + queuableSender.Send(payload2) + + // And then sleeping for 500ms + time.Sleep(600 * time.Millisecond) + + // Then, eventually, during one of the retries those 2 payloads should end up being discarded and our queue + // will end up with a size of 0 and a flush call will be made for a queue of size 0 + + // Then send a third payload + payload3 := randomPayload() + queuableSender.Send(payload3) + + // Wait for payload to be queued + syncBarrier <- nil + + // Then, when the endpoint finally works + flakyEndpoint.SetError(nil) + + // Wait for a retry + time.Sleep(200 * time.Millisecond) + + // When we stop the sender + queuableSender.Stop() + monitor.Stop() + + // Then we should have no queued payloads + assert.Equal(0, queuableSender.NumQueuedPayloads(), "We should have no queued payloads") + + // Then endpoint should have received only payload3. Because payload1 and payload2 were too old after the failed + // retry (first TriggerTick). 
+ assert.Equal([]*payload{payload3}, flakyEndpoint.SuccessPayloads(), "Endpoint should have received only payload 3") + + // And monitor should have received failed events for payload1 and payload2 with correct reason + assert.Equal([]*payload{payload1, payload2}, monitor.FailurePayloads(), + "Monitor should agree with endpoint on failed payloads") + assert.Contains(monitor.FailureEvents()[0].err.Error(), "older than max age", + "Monitor failure event should mention correct reason for error") +} diff --git a/pkg/trace/writer/service.go b/pkg/trace/writer/service.go new file mode 100644 index 0000000000000..d5baa90ea6d7e --- /dev/null +++ b/pkg/trace/writer/service.go @@ -0,0 +1,176 @@ +package writer + +import ( + "encoding/json" + "strings" + "sync/atomic" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + log "github.com/cihub/seelog" +) + +const pathServices = "/api/v0.2/services" + +// ServiceWriter ingests service metadata and flush them to the API. +type ServiceWriter struct { + stats info.ServiceWriterInfo + conf writerconfig.ServiceWriterConfig + InServices <-chan pb.ServicesMetadata + + serviceBuffer pb.ServicesMetadata + + sender payloadSender + exit chan struct{} +} + +// NewServiceWriter returns a new writer for services. +func NewServiceWriter(conf *config.AgentConfig, InServices <-chan pb.ServicesMetadata) *ServiceWriter { + cfg := conf.ServiceWriterConfig + endpoints := newEndpoints(conf, pathServices) + sender := newMultiSender(endpoints, cfg.SenderConfig) + log.Infof("Service writer initializing with config: %+v", cfg) + + return &ServiceWriter{ + conf: cfg, + InServices: InServices, + serviceBuffer: pb.ServicesMetadata{}, + sender: sender, + exit: make(chan struct{}), + } +} + +// Start starts the writer. +func (w *ServiceWriter) Start() { + w.sender.Start() + go func() { + defer watchdog.LogOnPanic() + w.Run() + }() +} + +// Run runs the main loop of the writer goroutine. If buffers +// services read from input chan and flushes them when necessary. 
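+// Typical wiring around this loop (hypothetical caller; metadata value elided):
+//
+//	ch := make(chan pb.ServicesMetadata)
+//	sw := NewServiceWriter(agentConf, ch)
+//	sw.Start()      // runs Run in its own goroutine
+//	ch <- metadata  // buffered and flushed on the next FlushPeriod tick
+//	sw.Stop()       // flushes anything left, then stops the sender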
+func (w *ServiceWriter) Run() { + defer close(w.exit) + + // for now, simply flush every x seconds + flushTicker := time.NewTicker(w.conf.FlushPeriod) + defer flushTicker.Stop() + + updateInfoTicker := time.NewTicker(w.conf.UpdateInfoPeriod) + defer updateInfoTicker.Stop() + + log.Debug("starting service writer") + + // Monitor sender for events + go func() { + for event := range w.sender.Monitor() { + switch event.typ { + case eventTypeSuccess: + url := event.stats.host + log.Infof("flushed service payload; url:%s, time:%s, size:%d bytes", url, event.stats.sendTime, + len(event.payload.bytes)) + tags := []string{"url:" + url} + metrics.Gauge("datadog.trace_agent.service_writer.flush_duration", + event.stats.sendTime.Seconds(), tags, 1) + atomic.AddInt64(&w.stats.Payloads, 1) + case eventTypeFailure: + url := event.stats.host + log.Errorf("failed to flush service payload; url:%s, time:%s, size:%d bytes, error: %s", + url, event.stats.sendTime, len(event.payload.bytes), event.err) + atomic.AddInt64(&w.stats.Errors, 1) + case eventTypeRetry: + log.Errorf("retrying flush service payload, retryNum: %d, delay:%s, error: %s", + event.retryNum, event.retryDelay, event.err) + atomic.AddInt64(&w.stats.Retries, 1) + default: + log.Debugf("don't know how to handle event with type %T", event) + } + } + }() + + // Main loop + for { + select { + case sm := <-w.InServices: + w.handleServiceMetadata(sm) + case <-flushTicker.C: + w.flush() + case <-updateInfoTicker.C: + go w.updateInfo() + case <-w.exit: + log.Info("exiting service writer, flushing all modified services") + w.flush() + return + } + } +} + +// Stop stops the main Run loop. +func (w *ServiceWriter) Stop() { + w.exit <- struct{}{} + <-w.exit + w.sender.Stop() +} + +func (w *ServiceWriter) handleServiceMetadata(metadata pb.ServicesMetadata) { + for k, v := range metadata { + w.serviceBuffer[k] = v + } +} + +func (w *ServiceWriter) flush() { + // If no services, we can't construct anything + if len(w.serviceBuffer) == 0 { + return + } + + numServices := len(w.serviceBuffer) + log.Debugf("going to flush updated service metadata, %d services", numServices) + atomic.StoreInt64(&w.stats.Services, int64(numServices)) + + data, err := json.Marshal(w.serviceBuffer) + if err != nil { + log.Errorf("error while encoding service payload: %v", err) + w.serviceBuffer = make(pb.ServicesMetadata) + return + } + + headers := map[string]string{ + languageHeaderKey: strings.Join(info.Languages(), "|"), + "Content-Type": "application/json", + } + + atomic.AddInt64(&w.stats.Bytes, int64(len(data))) + + payload := newPayload(data, headers) + w.sender.Send(payload) + + w.serviceBuffer = make(pb.ServicesMetadata) +} + +func (w *ServiceWriter) updateInfo() { + var swInfo info.ServiceWriterInfo + + // Load counters and reset them for the next flush + swInfo.Payloads = atomic.SwapInt64(&w.stats.Payloads, 0) + swInfo.Services = atomic.SwapInt64(&w.stats.Services, 0) + swInfo.Bytes = atomic.SwapInt64(&w.stats.Bytes, 0) + swInfo.Errors = atomic.SwapInt64(&w.stats.Errors, 0) + swInfo.Retries = atomic.SwapInt64(&w.stats.Retries, 0) + + // TODO(gbbr): Scope these stats per endpoint (see (config.AgentConfig).AdditionalEndpoints)) + metrics.Count("datadog.trace_agent.service_writer.payloads", int64(swInfo.Payloads), nil, 1) + metrics.Count("datadog.trace_agent.service_writer.services", int64(swInfo.Services), nil, 1) + metrics.Count("datadog.trace_agent.service_writer.bytes", int64(swInfo.Bytes), nil, 1) + metrics.Count("datadog.trace_agent.service_writer.retries", 
int64(swInfo.Retries), nil, 1) + metrics.Count("datadog.trace_agent.service_writer.errors", int64(swInfo.Errors), nil, 1) + + info.UpdateServiceWriterInfo(swInfo) +} diff --git a/pkg/trace/writer/service_test.go b/pkg/trace/writer/service_test.go new file mode 100644 index 0000000000000..4061f6fe91a3b --- /dev/null +++ b/pkg/trace/writer/service_test.go @@ -0,0 +1,211 @@ +package writer + +import ( + "encoding/json" + "fmt" + "math/rand" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + "github.com/stretchr/testify/assert" +) + +func TestServiceWriter_SenderMaxPayloads(t *testing.T) { + assert := assert.New(t) + + // Given a service writer + serviceWriter, _, _, _ := testServiceWriter() + + // When checking its default sender configuration + queuableSender := serviceWriter.sender.(*queuableSender) + + // Then the MaxQueuedPayloads setting should be -1 (unlimited) + assert.Equal(-1, queuableSender.conf.MaxQueuedPayloads) +} + +func TestServiceWriter_ServiceHandling(t *testing.T) { + assert := assert.New(t) + + // Given a service writer, its incoming channel and the endpoint that receives the payloads + serviceWriter, serviceChannel, testEndpoint, _ := testServiceWriter() + serviceWriter.conf.FlushPeriod = 100 * time.Millisecond + + serviceWriter.Start() + + // Given a set of service metadata + metadata1 := testutil.RandomServices(10, 10) + + // When sending it + serviceChannel <- metadata1 + + // And then immediately sending another set of service metadata + metadata2 := testutil.RandomServices(10, 10) + serviceChannel <- metadata2 + + // And then waiting for more than flush period + time.Sleep(2 * serviceWriter.conf.FlushPeriod) + + // And then sending a third set of service metadata + metadata3 := testutil.RandomServices(10, 10) + serviceChannel <- metadata3 + + // And stopping service writer before flush ticker ticks (should still flush on exit though) + close(serviceChannel) + serviceWriter.Stop() + + // Then the endpoint should have received 2 payloads, containing all sent metadata + expectedHeaders := map[string]string{ + "X-Datadog-Reported-Languages": strings.Join(info.Languages(), "|"), + "Content-Type": "application/json", + } + + mergedMetadata := mergeMetadataInOrder(metadata1, metadata2) + successPayloads := testEndpoint.SuccessPayloads() + + assert.Len(successPayloads, 2, "There should be 2 payloads") + assertMetadata(assert, expectedHeaders, mergedMetadata, successPayloads[0]) + assertMetadata(assert, expectedHeaders, metadata3, successPayloads[1]) +} + +func TestServiceWriter_UpdateInfoHandling(t *testing.T) { + rand.Seed(1) + assert := assert.New(t) + + // Given a service writer, its incoming channel and the endpoint that receives the payloads + serviceWriter, serviceChannel, testEndpoint, statsClient := testServiceWriter() + serviceWriter.conf.FlushPeriod = 100 * time.Millisecond + serviceWriter.conf.UpdateInfoPeriod = 100 * time.Millisecond + + serviceWriter.Start() + + expectedNumPayloads := int64(0) + expectedNumServices := int64(0) + expectedNumBytes := int64(0) + expectedMinNumRetries := int64(0) + expectedNumErrors := int64(0) + + // When sending a set of metadata + expectedNumPayloads++ + metadata1 := testutil.RandomServices(10, 10) + 
serviceChannel <- metadata1 + expectedNumServices += int64(len(metadata1)) + expectedNumBytes += calculateMetadataPayloadSize(metadata1) + + // And waiting for twice the flush period to trigger payload sending and info updating + time.Sleep(2 * serviceWriter.conf.FlushPeriod) + + // And then sending a second set of metadata + expectedNumPayloads++ + metadata2 := testutil.RandomServices(10, 10) + serviceChannel <- metadata2 + expectedNumServices += int64(len(metadata2)) + expectedNumBytes += calculateMetadataPayloadSize(metadata2) + + // And waiting for twice the flush period to trigger payload sending and info updating + time.Sleep(2 * serviceWriter.conf.FlushPeriod) + + // And then sending a third payload with other 3 traces with an errored out endpoint with no retry + testEndpoint.SetError(fmt.Errorf("non retriable error")) + expectedNumErrors++ + metadata3 := testutil.RandomServices(10, 10) + serviceChannel <- metadata3 + expectedNumServices += int64(len(metadata3)) + expectedNumBytes += calculateMetadataPayloadSize(metadata3) + + // And waiting for twice the flush period to trigger payload sending and info updating + time.Sleep(2 * serviceWriter.conf.FlushPeriod) + + // And then sending a third payload with other 3 traces with an errored out endpoint with retry + testEndpoint.SetError(&retriableError{ + err: fmt.Errorf("retriable error"), + endpoint: testEndpoint, + }) + expectedMinNumRetries++ + metadata4 := testutil.RandomServices(10, 10) + serviceChannel <- metadata4 + expectedNumServices += int64(len(metadata4)) + expectedNumBytes += calculateMetadataPayloadSize(metadata4) + + // And waiting for twice the flush period to trigger payload sending and info updating + time.Sleep(2 * serviceWriter.conf.FlushPeriod) + + close(serviceChannel) + serviceWriter.Stop() + + // Then we expect some counts and gauges to have been sent to the stats client for each update tick (there should + // have been at least 3 ticks) + countSummaries := statsClient.GetCountSummaries() + + // Payload counts + payloadSummary := countSummaries["datadog.trace_agent.service_writer.payloads"] + assert.True(len(payloadSummary.Calls) >= 3, "There should have been multiple payload count calls") + assert.Equal(expectedNumPayloads, payloadSummary.Sum) + + // Services count + servicesSummary := countSummaries["datadog.trace_agent.service_writer.services"] + assert.True(len(servicesSummary.Calls) >= 3, "There should have been multiple services gauge calls") + assert.EqualValues(expectedNumServices, servicesSummary.Sum) + + // Bytes counts + bytesSummary := countSummaries["datadog.trace_agent.service_writer.bytes"] + assert.True(len(bytesSummary.Calls) >= 3, "There should have been multiple bytes count calls") + assert.Equal(expectedNumBytes, bytesSummary.Sum) + + // Retry counts + retriesSummary := countSummaries["datadog.trace_agent.service_writer.retries"] + assert.True(len(retriesSummary.Calls) >= 2, "There should have been multiple retries count calls") + assert.True(retriesSummary.Sum >= expectedMinNumRetries) + + // Error counts + errorsSummary := countSummaries["datadog.trace_agent.service_writer.errors"] + assert.True(len(errorsSummary.Calls) >= 3, "There should have been multiple errors count calls") + assert.Equal(expectedNumErrors, errorsSummary.Sum) +} + +func mergeMetadataInOrder(metadatas ...pb.ServicesMetadata) pb.ServicesMetadata { + result := pb.ServicesMetadata{} + + for _, metadata := range metadatas { + for serviceName, serviceMetadata := range metadata { + result[serviceName] = serviceMetadata + } + } 
+ + return result +} + +func calculateMetadataPayloadSize(metadata pb.ServicesMetadata) int64 { + data, _ := json.Marshal(metadata) + return int64(len(data)) +} + +func assertMetadata(assert *assert.Assertions, expectedHeaders map[string]string, + expectedMetadata pb.ServicesMetadata, p *payload) { + servicesMetadata := pb.ServicesMetadata{} + + assert.NoError(json.Unmarshal(p.bytes, &servicesMetadata), "Stats payload should unmarshal correctly") + + assert.Equal(expectedHeaders, p.headers, "Headers should match expectation") + assert.Equal(expectedMetadata, servicesMetadata, "Service metadata should match expectation") +} + +func testServiceWriter() (*ServiceWriter, chan pb.ServicesMetadata, *testEndpoint, *testutil.TestStatsClient) { + serviceChannel := make(chan pb.ServicesMetadata) + conf := &config.AgentConfig{ + ServiceWriterConfig: writerconfig.DefaultServiceWriterConfig(), + } + serviceWriter := NewServiceWriter(conf, serviceChannel) + testEndpoint := &testEndpoint{} + serviceWriter.sender.setEndpoint(testEndpoint) + testStatsClient := metrics.Client.(*testutil.TestStatsClient) + testStatsClient.Reset() + + return serviceWriter, serviceChannel, testEndpoint, testStatsClient +} diff --git a/pkg/trace/writer/stats.go b/pkg/trace/writer/stats.go new file mode 100644 index 0000000000000..e13a0c8e7430f --- /dev/null +++ b/pkg/trace/writer/stats.go @@ -0,0 +1,306 @@ +package writer + +import ( + "strings" + "sync/atomic" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + log "github.com/cihub/seelog" +) + +const pathStats = "/api/v0.2/stats" + +// StatsWriter ingests stats buckets and flushes them to the API. +type StatsWriter struct { + sender payloadSender + exit chan struct{} + + // InStats is the stream of stat buckets to send out. + InStats <-chan []agent.StatsBucket + + // info contains various statistics about the writer, which are + // occasionally sent as metrics to Datadog. + info info.StatsWriterInfo + + // hostName specifies the resolved host name on which the agent is + // running, to be sent as part of a stats payload. + hostName string + + // env is environment this agent is configured with, to be sent as part + // of the stats payload. + env string + + conf writerconfig.StatsWriterConfig +} + +// NewStatsWriter returns a new writer for stats. +func NewStatsWriter(conf *config.AgentConfig, InStats <-chan []agent.StatsBucket) *StatsWriter { + cfg := conf.StatsWriterConfig + endpoints := newEndpoints(conf, pathStats) + sender := newMultiSender(endpoints, cfg.SenderConfig) + log.Infof("Stats writer initializing with config: %+v", cfg) + + return &StatsWriter{ + sender: sender, + exit: make(chan struct{}), + InStats: InStats, + hostName: conf.Hostname, + env: conf.DefaultEnv, + conf: cfg, + } +} + +// Start starts the writer, awaiting stat buckets and flushing them. +func (w *StatsWriter) Start() { + w.sender.Start() + + go func() { + defer watchdog.LogOnPanic() + w.Run() + }() + + go func() { + defer watchdog.LogOnPanic() + w.monitor() + }() +} + +// Run runs the event loop of the writer's main goroutine. It reads stat buckets +// from InStats, builds stat payloads and sends them out using the base writer. 
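The stats writer is fed the same way as the service writer. A sketch only; statsChan and someStatsBuckets are illustrative names, and the hostname/env values are assumptions mirroring testStatsWriter further down:

	statsChan := make(chan []agent.StatsBucket)
	cfg := &config.AgentConfig{
		Hostname:          "my-host", // assumed value
		DefaultEnv:        "prod",    // assumed value
		StatsWriterConfig: writerconfig.DefaultStatsWriterConfig(),
	}

	sw := NewStatsWriter(cfg, statsChan)
	sw.Start()

	// someStatsBuckets is a hypothetical []agent.StatsBucket; each send may be split into
	// several payloads depending on MaxEntriesPerPayload (see buildPayloads below).
	statsChan <- someStatsBuckets

	sw.Stop()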
+func (w *StatsWriter) Run() {
+	defer close(w.exit)
+
+	log.Debug("starting stats writer")
+
+	for {
+		select {
+		case stats := <-w.InStats:
+			w.handleStats(stats)
+		case <-w.exit:
+			log.Info("exiting stats writer")
+			return
+		}
+	}
+}
+
+// Stop stops the writer
+func (w *StatsWriter) Stop() {
+	w.exit <- struct{}{}
+	<-w.exit
+	w.sender.Stop()
+}
+
+func (w *StatsWriter) handleStats(stats []agent.StatsBucket) {
+	payloads, nbStatBuckets, nbEntries := w.buildPayloads(stats, w.conf.MaxEntriesPerPayload)
+	if len(payloads) == 0 {
+		return
+	}
+
+	log.Debugf("going to flush %v entries in %v stat buckets in %v payloads",
+		nbEntries, nbStatBuckets, len(payloads),
+	)
+
+	if len(payloads) > 1 {
+		atomic.AddInt64(&w.info.Splits, 1)
+	}
+	atomic.AddInt64(&w.info.StatsBuckets, int64(nbStatBuckets))
+
+	headers := map[string]string{
+		languageHeaderKey:  strings.Join(info.Languages(), "|"),
+		"Content-Type":     "application/json",
+		"Content-Encoding": "gzip",
+	}
+
+	for _, p := range payloads {
+		// synchronously send the payloads one after the other
+		data, err := agent.EncodeStatsPayload(p)
+		if err != nil {
+			log.Errorf("encoding issue: %v", err)
+			return
+		}
+
+		payload := newPayload(data, headers)
+		w.sender.Send(payload)
+
+		atomic.AddInt64(&w.info.Bytes, int64(len(data)))
+	}
+}
+
+type timeWindow struct {
+	start, duration int64
+}
+
+// buildPayloads returns a set of payloads to send out, each payload guaranteed
+// to have its number of entries under the given maximum.
+func (w *StatsWriter) buildPayloads(stats []agent.StatsBucket, maxEntriesPerPayloads int) ([]*agent.StatsPayload, int, int) {
+	if len(stats) == 0 {
+		return []*agent.StatsPayload{}, 0, 0
+	}
+
+	// 1. Get an estimate of how many payloads we need, based on the total
+	// number of map entries (i.e.: sum of number of items in the stats
+	// bucket's count map).
+	// NOTE: we use the number of items in the count map as the
+	// reference, but in reality, what takes place are the
+	// distributions. We are guaranteed the number of entries in the
+	// count map is > than the number of entries in the distributions
+	// maps, so the algorithm is correct, but indeed this means we could
+	// do better.
+	nbEntries := 0
+	for _, s := range stats {
+		nbEntries += len(s.Counts)
+	}
+
+	if maxEntriesPerPayloads <= 0 || nbEntries < maxEntriesPerPayloads {
+		// nothing to do, break early
+		return []*agent.StatsPayload{&agent.StatsPayload{
+			HostName: w.hostName,
+			Env:      w.env,
+			Stats:    stats,
+		}}, len(stats), nbEntries
+	}
+
+	nbPayloads := nbEntries / maxEntriesPerPayloads
+	if nbEntries%maxEntriesPerPayloads != 0 {
+		nbPayloads++
+	}
+
+	// 2. Create a slice of nbPayloads maps, mapping a time window (stat +
+	// duration) to a stat bucket. We will build the payloads from these
+	// maps. This allows us to have one stat bucket per time window.
+	pMaps := make([]map[timeWindow]agent.StatsBucket, nbPayloads)
+	for i := 0; i < nbPayloads; i++ {
+		pMaps[i] = make(map[timeWindow]agent.StatsBucket, nbPayloads)
+	}
+
+	// 3. Iterate over all entries of each stats bucket. Add the entry to one of
+	// the payload container mappings, in a round robin fashion. In some
+	// edge cases, we can end up having the same entry in several
+	// inputted stat buckets. We must check that we never overwrite an
+	// entry in the new stats buckets but cleanly merge instead.
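To make the arithmetic concrete, here is a worked version of the estimate and the round-robin spread, using the numbers exercised by TestStatsWriter_BuildPayloads further down (45 entries, a maximum of 12 per payload, no duplicate entry keys):

	nbEntries, maxEntriesPerPayloads := 45, 12

	nbPayloads := nbEntries / maxEntriesPerPayloads // 3
	if nbEntries%maxEntriesPerPayloads != 0 {
		nbPayloads++ // ceil(45/12) = 4 payloads
	}

	// Round-robin assignment of entries to payloads; the loop that follows does
	// the same thing on real buckets, with merging for duplicate keys.
	perPayload := make([]int, nbPayloads)
	for i := 0; i < nbEntries; i++ {
		perPayload[i%nbPayloads]++
	}
	// perPayload is now [12 11 11 11], matching the expectations in the tests.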
+ i := 0 + for _, b := range stats { + tw := timeWindow{b.Start, b.Duration} + + for ekey, e := range b.Counts { + pm := pMaps[i%nbPayloads] + newsb, ok := pm[tw] + if !ok { + newsb = agent.NewStatsBucket(tw.start, tw.duration) + } + pm[tw] = newsb + + if _, ok := newsb.Counts[ekey]; ok { + newsb.Counts[ekey].Merge(e) + } else { + newsb.Counts[ekey] = e + } + + if _, ok := b.Distributions[ekey]; ok { + if _, ok := newsb.Distributions[ekey]; ok { + newsb.Distributions[ekey].Merge(b.Distributions[ekey]) + } else { + newsb.Distributions[ekey] = b.Distributions[ekey] + } + } + if _, ok := b.ErrDistributions[ekey]; ok { + if _, ok := newsb.ErrDistributions[ekey]; ok { + newsb.ErrDistributions[ekey].Merge(b.ErrDistributions[ekey]) + } else { + newsb.ErrDistributions[ekey] = b.ErrDistributions[ekey] + } + } + i++ + } + } + + // 4. Create the nbPayloads payloads from the maps. + nbStats := 0 + nbEntries = 0 + payloads := make([]*agent.StatsPayload, 0, nbPayloads) + for _, pm := range pMaps { + pstats := make([]agent.StatsBucket, 0, len(pm)) + for _, sb := range pm { + pstats = append(pstats, sb) + nbEntries += len(sb.Counts) + } + payloads = append(payloads, &agent.StatsPayload{ + HostName: w.hostName, + Env: w.env, + Stats: pstats, + }) + + nbStats += len(pstats) + } + return payloads, nbStats, nbEntries +} + +// monitor runs the event loop of the writer's monitoring +// goroutine. It: +// - reads events from the payload sender's monitor channel, logs +// them, send out statsd metrics, and updates the writer info +// - periodically dumps the writer info +func (w *StatsWriter) monitor() { + monC := w.sender.Monitor() + + infoTicker := time.NewTicker(w.conf.UpdateInfoPeriod) + defer infoTicker.Stop() + + for { + select { + case e, ok := <-monC: + if !ok { + break + } + + switch e.typ { + case eventTypeSuccess: + url := e.stats.host + log.Infof("flushed stat payload; url: %s, time:%s, size:%d bytes", url, e.stats.sendTime, + len(e.payload.bytes)) + tags := []string{"url:" + url} + metrics.Gauge("datadog.trace_agent.stats_writer.flush_duration", + e.stats.sendTime.Seconds(), tags, 1) + atomic.AddInt64(&w.info.Payloads, 1) + case eventTypeFailure: + url := e.stats.host + log.Errorf("failed to flush stat payload; url:%s, time:%s, size:%d bytes, error: %s", + url, e.stats.sendTime, len(e.payload.bytes), e.err) + atomic.AddInt64(&w.info.Errors, 1) + case eventTypeRetry: + log.Errorf("retrying flush stat payload, retryNum: %d, delay:%s, error: %s", + e.retryNum, e.retryDelay, e.err) + atomic.AddInt64(&w.info.Retries, 1) + default: + log.Debugf("don't know how to handle event with type %T", e) + } + + case <-infoTicker.C: + var swInfo info.StatsWriterInfo + + // Load counters and reset them for the next flush + swInfo.Payloads = atomic.SwapInt64(&w.info.Payloads, 0) + swInfo.StatsBuckets = atomic.SwapInt64(&w.info.StatsBuckets, 0) + swInfo.Bytes = atomic.SwapInt64(&w.info.Bytes, 0) + swInfo.Retries = atomic.SwapInt64(&w.info.Retries, 0) + swInfo.Splits = atomic.SwapInt64(&w.info.Splits, 0) + swInfo.Errors = atomic.SwapInt64(&w.info.Errors, 0) + + // TODO(gbbr): Scope these stats per endpoint (see (config.AgentConfig).AdditionalEndpoints)) + metrics.Count("datadog.trace_agent.stats_writer.payloads", int64(swInfo.Payloads), nil, 1) + metrics.Count("datadog.trace_agent.stats_writer.stats_buckets", int64(swInfo.StatsBuckets), nil, 1) + metrics.Count("datadog.trace_agent.stats_writer.bytes", int64(swInfo.Bytes), nil, 1) + metrics.Count("datadog.trace_agent.stats_writer.retries", int64(swInfo.Retries), nil, 1) + 
metrics.Count("datadog.trace_agent.stats_writer.splits", int64(swInfo.Splits), nil, 1) + metrics.Count("datadog.trace_agent.stats_writer.errors", int64(swInfo.Errors), nil, 1) + + info.UpdateStatsWriterInfo(swInfo) + } + } +} diff --git a/pkg/trace/writer/stats_test.go b/pkg/trace/writer/stats_test.go new file mode 100644 index 0000000000000..be01269ab8f89 --- /dev/null +++ b/pkg/trace/writer/stats_test.go @@ -0,0 +1,405 @@ +package writer + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "fmt" + "math" + "math/rand" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/agent" + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + "github.com/stretchr/testify/assert" +) + +func TestStatsWriter_StatHandling(t *testing.T) { + assert := assert.New(t) + + // Given a stats writer, its incoming channel and the endpoint that receives the payloads + statsWriter, statsChannel, testEndpoint, _ := testStatsWriter() + + statsWriter.Start() + + // Given 2 slices of 3 test buckets + testStats1 := []agent.StatsBucket{ + testutil.RandomStatsBucket(3), + testutil.RandomStatsBucket(3), + testutil.RandomStatsBucket(3), + } + testStats2 := []agent.StatsBucket{ + testutil.RandomStatsBucket(3), + testutil.RandomStatsBucket(3), + testutil.RandomStatsBucket(3), + } + + // When sending those slices + statsChannel <- testStats1 + statsChannel <- testStats2 + + // And stopping stats writer + close(statsChannel) + statsWriter.Stop() + + payloads := testEndpoint.SuccessPayloads() + + // Then the endpoint should have received 2 payloads, containing all stat buckets + assert.Len(payloads, 2, "There should be 2 payloads") + + payload1 := payloads[0] + payload2 := payloads[1] + + expectedHeaders := map[string]string{ + "X-Datadog-Reported-Languages": strings.Join(info.Languages(), "|"), + "Content-Type": "application/json", + "Content-Encoding": "gzip", + } + + assertStatsPayload(assert, expectedHeaders, testStats1, payload1) + assertStatsPayload(assert, expectedHeaders, testStats2, payload2) +} + +func TestStatsWriter_UpdateInfoHandling(t *testing.T) { + rand.Seed(1) + assert := assert.New(t) + + // Given a stats writer, its incoming channel and the endpoint that receives the payloads + statsWriter, statsChannel, testEndpoint, statsClient := testStatsWriter() + statsWriter.conf.UpdateInfoPeriod = 100 * time.Millisecond + + statsWriter.Start() + + expectedNumPayloads := int64(0) + expectedNumBuckets := int64(0) + expectedNumBytes := int64(0) + expectedMinNumRetries := int64(0) + expectedNumErrors := int64(0) + + // When sending 1 payload with 3 buckets + expectedNumPayloads++ + payload1Buckets := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + statsChannel <- payload1Buckets + expectedNumBuckets += 3 + expectedNumBytes += calculateStatPayloadSize(payload1Buckets) + + // And another one with another 3 buckets + expectedNumPayloads++ + payload2Buckets := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + statsChannel <- payload2Buckets + expectedNumBuckets += 3 + expectedNumBytes += calculateStatPayloadSize(payload2Buckets) + + // Wait for previous payloads to be sent + time.Sleep(2 * 
statsWriter.conf.UpdateInfoPeriod) + + // And then sending a third payload with other 3 buckets with an errored out endpoint + testEndpoint.SetError(fmt.Errorf("non retriable error")) + expectedNumErrors++ + payload3Buckets := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + statsChannel <- payload3Buckets + expectedNumBuckets += 3 + expectedNumBytes += calculateStatPayloadSize(payload3Buckets) + + // And waiting for twice the flush period to trigger payload sending and info updating + time.Sleep(2 * statsWriter.conf.UpdateInfoPeriod) + + // And then sending a third payload with other 3 traces with an errored out endpoint with retry + testEndpoint.SetError(&retriableError{ + err: fmt.Errorf("non retriable error"), + endpoint: testEndpoint, + }) + expectedMinNumRetries++ + payload4Buckets := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + statsChannel <- payload4Buckets + expectedNumBuckets += 3 + expectedNumBytes += calculateStatPayloadSize(payload4Buckets) + + // And waiting for twice the flush period to trigger payload sending and info updating + time.Sleep(2 * statsWriter.conf.UpdateInfoPeriod) + + close(statsChannel) + statsWriter.Stop() + + // Then we expect some counts to have been sent to the stats client for each update tick (there should have been + // at least 3 ticks) + countSummaries := statsClient.GetCountSummaries() + + // Payload counts + payloadSummary := countSummaries["datadog.trace_agent.stats_writer.payloads"] + assert.True(len(payloadSummary.Calls) >= 3, "There should have been multiple payload count calls") + assert.Equal(expectedNumPayloads, payloadSummary.Sum) + + // Traces counts + bucketsSummary := countSummaries["datadog.trace_agent.stats_writer.stats_buckets"] + assert.True(len(bucketsSummary.Calls) >= 3, "There should have been multiple stats_buckets count calls") + assert.Equal(expectedNumBuckets, bucketsSummary.Sum) + + // Bytes counts + bytesSummary := countSummaries["datadog.trace_agent.stats_writer.bytes"] + assert.True(len(bytesSummary.Calls) >= 3, "There should have been multiple bytes count calls") + assert.Equal(expectedNumBytes, bytesSummary.Sum) + + // Retry counts + retriesSummary := countSummaries["datadog.trace_agent.stats_writer.retries"] + assert.True(len(retriesSummary.Calls) >= 2, "There should have been multiple retries count calls") + assert.True(retriesSummary.Sum >= expectedMinNumRetries) + + // Error counts + errorsSummary := countSummaries["datadog.trace_agent.stats_writer.errors"] + assert.True(len(errorsSummary.Calls) >= 3, "There should have been multiple errors count calls") + assert.Equal(expectedNumErrors, errorsSummary.Sum) +} + +func TestStatsWriter_BuildPayloads(t *testing.T) { + t.Run("common case, no duplicate entries", func(t *testing.T) { + assert := assert.New(t) + + sw, _, _, _ := testStatsWriter() + + // This gives us a total of 45 entries. 3 per span, 5 + // spans per stat bucket. Each buckets have the same + // time window (start: 0, duration 1e9). + stats := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + + // Remove duplicates so that we have a predictable state. In another + // case we'll test with duplicates. 
+ expectedNbEntries := removeDuplicateEntries(stats) + + expectedNbPayloads := int(math.Ceil(float64(expectedNbEntries) / 12)) + + // Compute our expected number of entries by payload + expectedNbEntriesByPayload := make([]int, expectedNbPayloads) + for i := 0; i < expectedNbEntries; i++ { + expectedNbEntriesByPayload[i%expectedNbPayloads]++ + } + + expectedCounts := countsByEntries(stats) + + payloads, nbStatBuckets, nbEntries := sw.buildPayloads(stats, 12) + + assert.Equal(expectedNbPayloads, len(payloads)) + assert.Equal(expectedNbPayloads, nbStatBuckets) + assert.Equal(expectedNbEntries, nbEntries) + + for i := 0; i < expectedNbPayloads; i++ { + assert.Equal(1, len(payloads[i].Stats)) + assert.Equal(expectedNbEntriesByPayload[i], len(payloads[i].Stats[0].Counts)) + } + + assertCountByEntries(assert, expectedCounts, payloads) + }) + + t.Run("common case, with duplicate entries", func(t *testing.T) { + rand.Seed(55) + assert := assert.New(t) + + sw, _, _, _ := testStatsWriter() + + // This gives us a total of 45 entries. 3 per span, 5 + // spans per stat bucket. Each buckets have the same + // time window (start: 0, duration 1e9). + stats := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + + // Remove duplicates so that we have a predictable + // state. + expectedNbEntries := removeDuplicateEntries(stats) + + // Ensure we have 45 - 2 entries, as we'll duplicate 2 + // of them. + for ekey := range stats[0].Counts { + if expectedNbEntries == 43 { + break + } + + delete(stats[0].Counts, ekey) + expectedNbEntries-- + } + + // Force 2 duplicates + i := 0 + for ekey, e := range stats[0].Counts { + if i >= 2 { + break + } + stats[1].Counts[ekey] = e + i++ + } + + expectedNbPayloads := int(math.Ceil(float64(expectedNbEntries) / 12)) + + // Compute our expected number of entries by payload + expectedNbEntriesByPayload := make([]int, expectedNbPayloads) + for i := 0; i < expectedNbEntries; i++ { + expectedNbEntriesByPayload[i%expectedNbPayloads]++ + } + + expectedCounts := countsByEntries(stats) + + payloads, nbStatBuckets, nbEntries := sw.buildPayloads(stats, 12) + + assert.Equal(expectedNbPayloads, len(payloads)) + assert.Equal(expectedNbPayloads, nbStatBuckets) + assert.Equal(expectedNbEntries, nbEntries) + + for i := 0; i < expectedNbPayloads; i++ { + assert.Equal(1, len(payloads[i].Stats)) + assert.Equal(expectedNbEntriesByPayload[i], len(payloads[i].Stats[0].Counts)) + } + + assertCountByEntries(assert, expectedCounts, payloads) + }) + + t.Run("no need for split", func(t *testing.T) { + rand.Seed(1) + assert := assert.New(t) + + sw, _, _, _ := testStatsWriter() + sw.Start() + + // This gives us a tota of 45 entries. 3 per span, 5 spans per + // stat bucket. Each buckets have the same time window (start: + // 0, duration 1e9). 
+ stats := []agent.StatsBucket{ + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + testutil.RandomStatsBucket(5), + } + + payloads, nbStatBuckets, nbEntries := sw.buildPayloads(stats, 1337) + + assert.Equal(1, len(payloads)) + assert.Equal(3, nbStatBuckets) + assert.Equal(45, nbEntries) + + assert.Equal(3, len(payloads[0].Stats)) + assert.Equal(15, len(payloads[0].Stats[0].Counts)) + assert.Equal(15, len(payloads[0].Stats[1].Counts)) + assert.Equal(15, len(payloads[0].Stats[2].Counts)) + }) +} + +func removeDuplicateEntries(stats []agent.StatsBucket) int { + nbEntries := 0 + entries := make(map[string]struct{}, 45) + for _, s := range stats { + for ekey := range s.Counts { + if _, ok := entries[ekey]; !ok { + entries[ekey] = struct{}{} + nbEntries++ + } else { + delete(s.Counts, ekey) + } + } + } + return nbEntries +} + +func countsByEntries(stats []agent.StatsBucket) map[string]float64 { + counts := make(map[string]float64) + for _, s := range stats { + for k, c := range s.Counts { + v, ok := counts[k] + if !ok { + v = 0 + } + v += c.Value + counts[k] = v + } + } + + return counts +} + +func assertCountByEntries(assert *assert.Assertions, expectedCounts map[string]float64, payloads []*agent.StatsPayload) { + actualCounts := make(map[string]float64) + for _, p := range payloads { + for _, s := range p.Stats { + for ekey, e := range s.Counts { + v, ok := actualCounts[ekey] + if !ok { + v = 0 + } + v += e.Value + actualCounts[ekey] = v + } + } + } + + assert.Equal(expectedCounts, actualCounts) +} + +func calculateStatPayloadSize(buckets []agent.StatsBucket) int64 { + statsPayload := &agent.StatsPayload{ + HostName: testHostName, + Env: testEnv, + Stats: buckets, + } + + data, _ := agent.EncodeStatsPayload(statsPayload) + return int64(len(data)) +} + +func assertStatsPayload(assert *assert.Assertions, headers map[string]string, buckets []agent.StatsBucket, p *payload) { + statsPayload := agent.StatsPayload{} + + reader := bytes.NewBuffer(p.bytes) + gzipReader, err := gzip.NewReader(reader) + + assert.NoError(err, "Gzip reader should work correctly") + + jsonDecoder := json.NewDecoder(gzipReader) + + assert.NoError(jsonDecoder.Decode(&statsPayload), "Stats payload should unmarshal correctly") + + assert.Equal(headers, p.headers, "Headers should match expectation") + assert.Equal(testHostName, statsPayload.HostName, "Hostname should match expectation") + assert.Equal(testEnv, statsPayload.Env, "Env should match expectation") + assert.Equal(buckets, statsPayload.Stats, "Stat buckets should match expectation") +} + +func testStatsWriter() (*StatsWriter, chan []agent.StatsBucket, *testEndpoint, *testutil.TestStatsClient) { + statsChannel := make(chan []agent.StatsBucket) + conf := &config.AgentConfig{ + Hostname: testHostName, + DefaultEnv: testEnv, + StatsWriterConfig: writerconfig.DefaultStatsWriterConfig(), + } + statsWriter := NewStatsWriter(conf, statsChannel) + testEndpoint := &testEndpoint{} + statsWriter.sender.setEndpoint(testEndpoint) + testStatsClient := metrics.Client.(*testutil.TestStatsClient) + testStatsClient.Reset() + + return statsWriter, statsChannel, testEndpoint, testStatsClient +} diff --git a/pkg/trace/writer/trace.go b/pkg/trace/writer/trace.go new file mode 100644 index 0000000000000..a538f39ef4ece --- /dev/null +++ b/pkg/trace/writer/trace.go @@ -0,0 +1,296 @@ +package writer + +import ( + "bytes" + "compress/gzip" + "strings" + "sync/atomic" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + 
"github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + "github.com/DataDog/datadog-agent/pkg/trace/watchdog" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + log "github.com/cihub/seelog" + "github.com/golang/protobuf/proto" +) + +const pathTraces = "/api/v0.2/traces" + +// TracePackage represents the result of a trace sampling operation. +// +// NOTE: A TracePackage can be valid even if any of its fields is nil/empty. In particular, a common case is that of +// empty Trace but non-empty Events. This happens when events are extracted from a trace that wasn't sampled. +type TracePackage struct { + // Trace will contain a trace if it was sampled or be empty if it wasn't. + Trace pb.Trace + // Events contains all APMEvents extracted from a trace. If no events were extracted, it will be empty. + Events []*pb.Span +} + +// Empty returns true if this TracePackage has no data. +func (s *TracePackage) Empty() bool { + return len(s.Trace) == 0 && len(s.Events) == 0 +} + +// TraceWriter ingests sampled traces and flushes them to the API. +type TraceWriter struct { + stats info.TraceWriterInfo + hostName string + env string + conf writerconfig.TraceWriterConfig + in <-chan *TracePackage + + traces []*pb.APITrace + events []*pb.Span + spansInBuffer int + + sender payloadSender + exit chan struct{} +} + +// NewTraceWriter returns a new writer for traces. +func NewTraceWriter(conf *config.AgentConfig, in <-chan *TracePackage) *TraceWriter { + cfg := conf.TraceWriterConfig + endpoints := newEndpoints(conf, pathTraces) + sender := newMultiSender(endpoints, cfg.SenderConfig) + log.Infof("Trace writer initializing with config: %+v", cfg) + + return &TraceWriter{ + conf: cfg, + hostName: conf.Hostname, + env: conf.DefaultEnv, + + traces: []*pb.APITrace{}, + events: []*pb.Span{}, + + in: in, + + sender: sender, + exit: make(chan struct{}), + } +} + +// Start starts the writer. +func (w *TraceWriter) Start() { + w.sender.Start() + go func() { + defer watchdog.LogOnPanic() + w.Run() + }() +} + +// Run runs the main loop of the writer goroutine. It sends traces to the payload constructor, flushing it periodically +// and collects stats which are also reported periodically. 
+func (w *TraceWriter) Run() { + defer close(w.exit) + + // for now, simply flush every x seconds + flushTicker := time.NewTicker(w.conf.FlushPeriod) + defer flushTicker.Stop() + + updateInfoTicker := time.NewTicker(w.conf.UpdateInfoPeriod) + defer updateInfoTicker.Stop() + + // Monitor sender for events + go func() { + for event := range w.sender.Monitor() { + switch event.typ { + case eventTypeSuccess: + log.Infof("flushed trace payload to the API, time:%s, size:%d bytes", event.stats.sendTime, + len(event.payload.bytes)) + tags := []string{"url:" + event.stats.host} + metrics.Gauge("datadog.trace_agent.trace_writer.flush_duration", + event.stats.sendTime.Seconds(), tags, 1) + atomic.AddInt64(&w.stats.Payloads, 1) + case eventTypeFailure: + log.Errorf("failed to flush trace payload, host:%s, time:%s, size:%d bytes, error: %s", + event.stats.host, event.stats.sendTime, len(event.payload.bytes), event.err) + atomic.AddInt64(&w.stats.Errors, 1) + case eventTypeRetry: + log.Errorf("retrying flush trace payload, retryNum: %d, delay:%s, error: %s", + event.retryNum, event.retryDelay, event.err) + atomic.AddInt64(&w.stats.Retries, 1) + default: + log.Debugf("don't know how to handle event with type %T", event) + } + } + }() + + log.Debug("starting trace writer") + + for { + select { + case sampledTrace := <-w.in: + w.handleSampledTrace(sampledTrace) + case <-flushTicker.C: + log.Debug("Flushing current traces") + w.flush() + case <-updateInfoTicker.C: + go w.updateInfo() + case <-w.exit: + log.Info("exiting trace writer, flushing all remaining traces") + w.flush() + w.updateInfo() + log.Info("Flushed. Exiting") + return + } + } +} + +// Stop stops the main Run loop. +func (w *TraceWriter) Stop() { + w.exit <- struct{}{} + <-w.exit + w.sender.Stop() +} + +func (w *TraceWriter) handleSampledTrace(sampledTrace *TracePackage) { + if sampledTrace == nil || sampledTrace.Empty() { + log.Debug("Ignoring empty sampled trace") + return + } + + trace := sampledTrace.Trace + events := sampledTrace.Events + + n := len(trace) + len(events) + + if w.spansInBuffer > 0 && w.spansInBuffer+n > w.conf.MaxSpansPerPayload { + // If we have data pending and adding the new data would overflow max spans per payload, force a flush + w.flushDueToMaxSpansPerPayload() + } + + w.appendTrace(sampledTrace.Trace) + w.appendEvents(sampledTrace.Events) + + if n > w.conf.MaxSpansPerPayload { + // If what we just added already goes over the limit, report this but lets carry on and flush + atomic.AddInt64(&w.stats.SingleMaxSpans, 1) + w.flushDueToMaxSpansPerPayload() + } +} + +func (w *TraceWriter) appendTrace(trace pb.Trace) { + numSpans := len(trace) + + if numSpans == 0 { + return + } + + log.Tracef("Handling new trace with %d spans: %v", numSpans, trace) + + w.traces = append(w.traces, traceutil.APITrace(trace)) + w.spansInBuffer += numSpans +} + +func (w *TraceWriter) appendEvents(events []*pb.Span) { + for _, event := range events { + log.Tracef("Handling new APM event: %v", event) + w.events = append(w.events, event) + } + + w.spansInBuffer += len(events) +} + +func (w *TraceWriter) flushDueToMaxSpansPerPayload() { + log.Debugf("Flushing because we reached max per payload") + w.flush() +} + +func (w *TraceWriter) flush() { + numTraces := len(w.traces) + numEvents := len(w.events) + + // If no traces, we can't construct anything + if numTraces == 0 && numEvents == 0 { + return + } + + atomic.AddInt64(&w.stats.Traces, int64(numTraces)) + atomic.AddInt64(&w.stats.Events, int64(numEvents)) + atomic.AddInt64(&w.stats.Spans, 
int64(w.spansInBuffer)) + + tracePayload := pb.TracePayload{ + HostName: w.hostName, + Env: w.env, + Traces: w.traces, + Transactions: w.events, + } + + serialized, err := proto.Marshal(&tracePayload) + if err != nil { + log.Errorf("failed to serialize trace payload, data got dropped, err: %s", err) + w.resetBuffer() + return + } + + encoding := "identity" + + // Try to compress payload before sending + compressionBuffer := bytes.Buffer{} + gz, err := gzip.NewWriterLevel(&compressionBuffer, gzip.BestSpeed) + if err != nil { + log.Errorf("failed to get compressor, sending uncompressed: %s", err) + } else { + _, err := gz.Write(serialized) + gz.Close() + + if err != nil { + log.Errorf("failed to compress payload, sending uncompressed: %s", err) + } else { + serialized = compressionBuffer.Bytes() + encoding = "gzip" + } + } + + atomic.AddInt64(&w.stats.Bytes, int64(len(serialized))) + + headers := map[string]string{ + languageHeaderKey: strings.Join(info.Languages(), "|"), + "Content-Type": "application/x-protobuf", + "Content-Encoding": encoding, + } + + payload := newPayload(serialized, headers) + + log.Debugf("flushing traces=%v events=%v", len(w.traces), len(w.events)) + w.sender.Send(payload) + w.resetBuffer() +} + +func (w *TraceWriter) resetBuffer() { + // Reset traces + w.traces = w.traces[:0] + w.events = w.events[:0] + w.spansInBuffer = 0 +} + +func (w *TraceWriter) updateInfo() { + // TODO(gbbr): Scope these stats per endpoint (see (config.AgentConfig).AdditionalEndpoints)) + var twInfo info.TraceWriterInfo + + // Load counters and reset them for the next flush + twInfo.Payloads = atomic.SwapInt64(&w.stats.Payloads, 0) + twInfo.Traces = atomic.SwapInt64(&w.stats.Traces, 0) + twInfo.Events = atomic.SwapInt64(&w.stats.Events, 0) + twInfo.Spans = atomic.SwapInt64(&w.stats.Spans, 0) + twInfo.Bytes = atomic.SwapInt64(&w.stats.Bytes, 0) + twInfo.Retries = atomic.SwapInt64(&w.stats.Retries, 0) + twInfo.Errors = atomic.SwapInt64(&w.stats.Errors, 0) + twInfo.SingleMaxSpans = atomic.SwapInt64(&w.stats.SingleMaxSpans, 0) + + metrics.Count("datadog.trace_agent.trace_writer.payloads", int64(twInfo.Payloads), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.traces", int64(twInfo.Traces), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.events", int64(twInfo.Events), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.spans", int64(twInfo.Spans), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.bytes", int64(twInfo.Bytes), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.retries", int64(twInfo.Retries), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.errors", int64(twInfo.Errors), nil, 1) + metrics.Count("datadog.trace_agent.trace_writer.single_max_spans", int64(twInfo.SingleMaxSpans), nil, 1) + + info.UpdateTraceWriterInfo(twInfo) +} diff --git a/pkg/trace/writer/trace_test.go b/pkg/trace/writer/trace_test.go new file mode 100644 index 0000000000000..3cc03c9c60076 --- /dev/null +++ b/pkg/trace/writer/trace_test.go @@ -0,0 +1,376 @@ +package writer + +import ( + "bytes" + "compress/gzip" + "fmt" + "math" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/pkg/trace/config" + "github.com/DataDog/datadog-agent/pkg/trace/info" + "github.com/DataDog/datadog-agent/pkg/trace/metrics" + "github.com/DataDog/datadog-agent/pkg/trace/pb" + "github.com/DataDog/datadog-agent/pkg/trace/test/testutil" + "github.com/DataDog/datadog-agent/pkg/trace/traceutil" + writerconfig "github.com/DataDog/datadog-agent/pkg/trace/writer/config" + 
"github.com/gogo/protobuf/proto" + "github.com/stretchr/testify/assert" +) + +var testHostName = "testhost" +var testEnv = "testenv" + +func TestTraceWriter(t *testing.T) { + t.Run("payload flushing", func(t *testing.T) { + assert := assert.New(t) + + // Create a trace writer, its incoming channel and the endpoint that receives the payloads + traceWriter, traceChannel, testEndpoint, _ := testTraceWriter() + // Set a maximum of 4 spans per payload + traceWriter.conf.MaxSpansPerPayload = 4 + traceWriter.Start() + + // Send a few sampled traces through the writer + sampledTraces := []*TracePackage{ + // These 2 should be grouped together in a single payload + randomTracePackage(1, 1), + randomTracePackage(1, 1), + // This one should be on its own in a single payload + randomTracePackage(3, 1), + // This one should be on its own in a single payload + randomTracePackage(5, 1), + // This one should be on its own in a single payload + randomTracePackage(1, 1), + } + for _, sampledTrace := range sampledTraces { + traceChannel <- sampledTrace + } + + // Stop the trace writer to force everything to flush + close(traceChannel) + traceWriter.Stop() + + expectedHeaders := map[string]string{ + "X-Datadog-Reported-Languages": strings.Join(info.Languages(), "|"), + "Content-Type": "application/x-protobuf", + "Content-Encoding": "gzip", + } + + // Ensure that the number of payloads and their contents match our expectations. The MaxSpansPerPayload we + // set to 4 at the beginning should have been respected whenever possible. + assert.Len(testEndpoint.SuccessPayloads(), 4, "We expected 4 different payloads") + assertPayloads(assert, traceWriter, expectedHeaders, sampledTraces, testEndpoint.SuccessPayloads()) + }) + + t.Run("periodic flushing", func(t *testing.T) { + assert := assert.New(t) + + testFlushPeriod := 100 * time.Millisecond + + // Create a trace writer, its incoming channel and the endpoint that receives the payloads + traceWriter, traceChannel, testEndpoint, _ := testTraceWriter() + // Periodically flushing every 100ms + traceWriter.conf.FlushPeriod = testFlushPeriod + traceWriter.Start() + + // Send a single trace that does not go over the span limit + testSampledTrace := randomTracePackage(2, 2) + traceChannel <- testSampledTrace + + // Wait for twice the flush period + time.Sleep(2 * testFlushPeriod) + + // Check that we received 1 payload that was flushed due to periodical flushing and that it matches the + // data we sent to the writer + receivedPayloads := testEndpoint.SuccessPayloads() + expectedHeaders := map[string]string{ + "X-Datadog-Reported-Languages": strings.Join(info.Languages(), "|"), + "Content-Type": "application/x-protobuf", + "Content-Encoding": "gzip", + } + assert.Len(receivedPayloads, 1, "We expected 1 payload") + assertPayloads(assert, traceWriter, expectedHeaders, []*TracePackage{testSampledTrace}, + testEndpoint.SuccessPayloads()) + + // Wrap up + close(traceChannel) + traceWriter.Stop() + }) + + t.Run("periodic stats reporting", func(t *testing.T) { + assert := assert.New(t) + + testFlushPeriod := 100 * time.Millisecond + + // Create a trace writer, its incoming channel and the endpoint that receives the payloads + traceWriter, traceChannel, testEndpoint, statsClient := testTraceWriter() + traceWriter.conf.FlushPeriod = 100 * time.Millisecond + traceWriter.conf.UpdateInfoPeriod = 100 * time.Millisecond + traceWriter.conf.MaxSpansPerPayload = 10 + traceWriter.Start() + + var ( + expectedNumPayloads int64 + expectedNumSpans int64 + expectedNumTraces int64 + 
expectedNumBytes int64 + expectedNumErrors int64 + expectedMinNumRetries int64 + expectedNumSingleMaxSpans int64 + ) + + // Send a bunch of sampled traces that should go together in a single payload + payload1SampledTraces := []*TracePackage{ + randomTracePackage(2, 0), + randomTracePackage(2, 0), + randomTracePackage(2, 0), + } + expectedNumPayloads++ + expectedNumSpans += 6 + expectedNumTraces += 3 + expectedNumBytes += calculateTracePayloadSize(payload1SampledTraces) + + for _, sampledTrace := range payload1SampledTraces { + traceChannel <- sampledTrace + } + + // Send a single trace that goes over the span limit + payload2SampledTraces := []*TracePackage{ + randomTracePackage(20, 0), + } + expectedNumPayloads++ + expectedNumSpans += 20 + expectedNumTraces++ + expectedNumBytes += calculateTracePayloadSize(payload2SampledTraces) + expectedNumSingleMaxSpans++ + + for _, sampledTrace := range payload2SampledTraces { + traceChannel <- sampledTrace + } + + // Wait for twice the flush period + time.Sleep(2 * testFlushPeriod) + + // Send a third payload with other 3 traces with an errored out endpoint + testEndpoint.SetError(fmt.Errorf("non retriable error")) + payload3SampledTraces := []*TracePackage{ + randomTracePackage(2, 0), + randomTracePackage(2, 0), + randomTracePackage(2, 0), + } + + expectedNumErrors++ + expectedNumTraces += 3 + expectedNumSpans += 6 + expectedNumBytes += calculateTracePayloadSize(payload3SampledTraces) + + for _, sampledTrace := range payload3SampledTraces { + traceChannel <- sampledTrace + } + + // Wait for twice the flush period + time.Sleep(2 * testFlushPeriod) + + // And then send a fourth payload with other 3 traces with an errored out endpoint but retriable + testEndpoint.SetError(&retriableError{ + err: fmt.Errorf("non retriable error"), + endpoint: testEndpoint, + }) + payload4SampledTraces := []*TracePackage{ + randomTracePackage(2, 0), + randomTracePackage(2, 0), + randomTracePackage(2, 0), + } + + expectedMinNumRetries++ + expectedNumTraces += 3 + expectedNumSpans += 6 + expectedNumBytes += calculateTracePayloadSize(payload4SampledTraces) + + for _, sampledTrace := range payload4SampledTraces { + traceChannel <- sampledTrace + } + + // Wait for twice the flush period to see at least one retry + time.Sleep(2 * testFlushPeriod) + + // Close and stop + close(traceChannel) + traceWriter.Stop() + + // Then we expect some counts to have been sent to the stats client for each update tick (there should have been + // at least 3 ticks) + countSummaries := statsClient.GetCountSummaries() + + // Payload counts + payloadSummary := countSummaries["datadog.trace_agent.trace_writer.payloads"] + assert.True(len(payloadSummary.Calls) >= 3, "There should have been multiple payload count calls") + assert.Equal(expectedNumPayloads, payloadSummary.Sum) + + // Traces counts + tracesSummary := countSummaries["datadog.trace_agent.trace_writer.traces"] + assert.True(len(tracesSummary.Calls) >= 3, "There should have been multiple traces count calls") + assert.Equal(expectedNumTraces, tracesSummary.Sum) + + // Spans counts + spansSummary := countSummaries["datadog.trace_agent.trace_writer.spans"] + assert.True(len(spansSummary.Calls) >= 3, "There should have been multiple spans count calls") + assert.Equal(expectedNumSpans, spansSummary.Sum) + + // Bytes counts + bytesSummary := countSummaries["datadog.trace_agent.trace_writer.bytes"] + assert.True(len(bytesSummary.Calls) >= 3, "There should have been multiple bytes count calls") + // FIXME: Is GZIP non-deterministic? 
Why won't equal work here? + assert.True(math.Abs(float64(expectedNumBytes-bytesSummary.Sum)) < 100., "Bytes should be within expectations") + + // Retry counts + retriesSummary := countSummaries["datadog.trace_agent.trace_writer.retries"] + assert.True(len(retriesSummary.Calls) >= 2, "There should have been multiple retries count calls") + assert.True(retriesSummary.Sum >= expectedMinNumRetries) + + // Error counts + errorsSummary := countSummaries["datadog.trace_agent.trace_writer.errors"] + assert.True(len(errorsSummary.Calls) >= 3, "There should have been multiple errors count calls") + assert.Equal(expectedNumErrors, errorsSummary.Sum) + + // Single trace max spans + singleMaxSpansSummary := countSummaries["datadog.trace_agent.trace_writer.single_max_spans"] + assert.True(len(singleMaxSpansSummary.Calls) >= 3, "There should have been multiple single max spans count calls") + assert.Equal(expectedNumSingleMaxSpans, singleMaxSpansSummary.Sum) + }) +} + +func calculateTracePayloadSize(sampledTraces []*TracePackage) int64 { + apiTraces := make([]*pb.APITrace, len(sampledTraces)) + + for i, trace := range sampledTraces { + apiTraces[i] = traceutil.APITrace(trace.Trace) + } + + tracePayload := pb.TracePayload{ + HostName: testHostName, + Env: testEnv, + Traces: apiTraces, + } + + serialized, _ := proto.Marshal(&tracePayload) + + compressionBuffer := bytes.Buffer{} + gz, err := gzip.NewWriterLevel(&compressionBuffer, gzip.BestSpeed) + + if err != nil { + panic(err) + } + + _, err = gz.Write(serialized) + gz.Close() + + if err != nil { + panic(err) + } + + return int64(len(compressionBuffer.Bytes())) +} + +func assertPayloads(assert *assert.Assertions, traceWriter *TraceWriter, expectedHeaders map[string]string, + sampledTraces []*TracePackage, payloads []*payload) { + + var expectedTraces []pb.Trace + var expectedEvents []*pb.Span + + for _, sampledTrace := range sampledTraces { + expectedTraces = append(expectedTraces, sampledTrace.Trace) + + for _, event := range sampledTrace.Events { + expectedEvents = append(expectedEvents, event) + } + } + + var expectedTraceIdx int + var expectedEventIdx int + + for _, payload := range payloads { + assert.Equal(expectedHeaders, payload.headers, "Payload headers should match expectation") + + var tracePayload pb.TracePayload + payloadBuffer := bytes.NewBuffer(payload.bytes) + gz, err := gzip.NewReader(payloadBuffer) + assert.NoError(err, "Gzip reader should work correctly") + uncompressedBuffer := bytes.Buffer{} + _, err = uncompressedBuffer.ReadFrom(gz) + gz.Close() + assert.NoError(err, "Should uncompress ok") + assert.NoError(proto.Unmarshal(uncompressedBuffer.Bytes(), &tracePayload), "Unmarshalling should work correctly") + + assert.Equal(testEnv, tracePayload.Env, "Envs should match") + assert.Equal(testHostName, tracePayload.HostName, "Hostnames should match") + + numSpans := 0 + + for _, seenAPITrace := range tracePayload.Traces { + numSpans += len(seenAPITrace.Spans) + + if !assert.True(proto.Equal(traceutil.APITrace(expectedTraces[expectedTraceIdx]), seenAPITrace), + "Unmarshalled trace should match expectation at index %d", expectedTraceIdx) { + return + } + + expectedTraceIdx++ + } + + for _, seenTransaction := range tracePayload.Transactions { + numSpans++ + + if !assert.True(proto.Equal(expectedEvents[expectedEventIdx], seenTransaction), + "Unmarshalled transaction should match expectation at index %d", expectedTraceIdx) { + return + } + + expectedEventIdx++ + } + + // If there's more than 1 trace or transaction in this payload, don't let 
it go over the limit. Otherwise,
+		// a single trace+transaction combination is allowed to go over the limit.
+		if len(tracePayload.Traces) > 1 || len(tracePayload.Transactions) > 1 {
+			assert.True(numSpans <= traceWriter.conf.MaxSpansPerPayload)
+		}
+	}
+}
+
+func testTraceWriter() (*TraceWriter, chan *TracePackage, *testEndpoint, *testutil.TestStatsClient) {
+	payloadChannel := make(chan *TracePackage)
+	conf := &config.AgentConfig{
+		Hostname:          testHostName,
+		DefaultEnv:        testEnv,
+		TraceWriterConfig: writerconfig.DefaultTraceWriterConfig(),
+	}
+	traceWriter := NewTraceWriter(conf, payloadChannel)
+	testEndpoint := &testEndpoint{}
+	traceWriter.sender.setEndpoint(testEndpoint)
+	testStatsClient := metrics.Client.(*testutil.TestStatsClient)
+	testStatsClient.Reset()
+
+	return traceWriter, payloadChannel, testEndpoint, testStatsClient
+}
+
+func randomTracePackage(numSpans, numEvents int) *TracePackage {
+	if numSpans < numEvents {
+		panic("can't have more events than spans in a randomTracePackage")
+	}
+
+	trace := testutil.GetTestTrace(1, numSpans, true)[0]
+
+	events := make([]*pb.Span, 0, numEvents)
+
+	for _, span := range trace[:numEvents] {
+		events = append(events, span)
+	}
+
+	return &TracePackage{
+		Trace:  trace,
+		Events: events,
+	}
+}
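For completeness, a minimal sketch of decoding a payload flushed by the trace writer, mirroring assertPayloads above (p stands for a hypothetical *payload captured by a testEndpoint; error handling is elided):

	gz, err := gzip.NewReader(bytes.NewReader(p.bytes))
	if err != nil {
		// handle error
	}

	var uncompressed bytes.Buffer
	if _, err := uncompressed.ReadFrom(gz); err != nil {
		// handle error
	}

	var tracePayload pb.TracePayload
	if err := proto.Unmarshal(uncompressed.Bytes(), &tracePayload); err != nil {
		// handle error
	}

	// tracePayload.HostName, tracePayload.Env, tracePayload.Traces and
	// tracePayload.Transactions now hold what was sent to /api/v0.2/traces.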