Skip to content

Commit

Permalink
[apm] standardize peer tag aggregation (#20550)
Browse files Browse the repository at this point in the history
* add new config flag for peer tag aggregation, add peer tag defaults

* add peer_tags_aggregation config

* update aggregation to work with just peer tags, remove dependency on specific peer.service field

* add tests for instantiating concentrator with different peer tag related configurations

* fix tests for client stats aggregator

* correct inconsistent naming for peer tags aggregation variable

* fix configuration of peer tags aggregation flag

* Update pkg/config/config_template.yaml

Co-authored-by: May Lee <[email protected]>

* Update pkg/config/config_template.yaml

Co-authored-by: May Lee <[email protected]>

* Update pkg/config/config_template.yaml

Co-authored-by: May Lee <[email protected]>

* Update pkg/config/config_template.yaml

Co-authored-by: May Lee <[email protected]>

* add _dd.base_service to default peer tags

* add split_payload field for client stats payload protobuf

* Revert "add split_payload field for client stats payload protobuf"

This reverts commit fc01656.

* add splitPayload field to StatsPayload struct

* mark StatsPayloads with splitPayload field

* add test cases for instantiation of client stats aggregator

* make peer_service a reserved field

* remove old PeerService field references

* remove PeerService field references in tests

* add new benchmark

* revamp benchmark to account for peer tags

* correct logic for marking payloads as split

* appease linter

* add db.system to defaults for peer tags

* allow either peer service aggregation or peer tags aggregation flag to enable all default peer tags

* ensure that we only retrieve peer tag when its value is non-empty

* update default peer tags

* additional updates for peer tags list

* revise documentation for flags related to peer tags aggregation

* add release notes

---------

Co-authored-by: May Lee <[email protected]>
  • Loading branch information
jdgumz and maycmlee authored Nov 8, 2023
1 parent eb09813 commit e43ee12
Show file tree
Hide file tree
Showing 21 changed files with 585 additions and 676 deletions.
4 changes: 4 additions & 0 deletions comp/trace/config/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@ func applyDatadogConfig(c *config.AgentConfig, core corecompcfg.Component) error
c.ConnectionLimit = core.GetInt("apm_config.connection_limit")
}
c.PeerServiceAggregation = core.GetBool("apm_config.peer_service_aggregation")
if c.PeerServiceAggregation {
log.Warn("`apm_config.peer_service_aggregation` is deprecated, please use `apm_config.peer_tags_aggregation` instead")
}
c.PeerTagsAggregation = core.GetBool("apm_config.peer_tags_aggregation")
c.ComputeStatsBySpanKind = core.GetBool("apm_config.compute_stats_by_span_kind")
if core.IsSet("apm_config.peer_tags") {
c.PeerTags = core.GetStringSlice("apm_config.peer_tags")
Expand Down
1 change: 1 addition & 0 deletions pkg/config/apm.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ func setupAPM(config Config) {
config.BindEnvAndSetDefault("apm_config.windows_pipe_security_descriptor", "D:AI(A;;GA;;;WD)", "DD_APM_WINDOWS_PIPE_SECURITY_DESCRIPTOR") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.remote_tagger", true, "DD_APM_REMOTE_TAGGER") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.peer_service_aggregation", false, "DD_APM_PEER_SERVICE_AGGREGATION") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.peer_tags_aggregation", false, "DD_APM_PEER_TAGS_AGGREGATION") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.compute_stats_by_span_kind", false, "DD_APM_COMPUTE_STATS_BY_SPAN_KIND") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.instrumentation.enabled", false, "DD_APM_INSTRUMENTATION_ENABLED")
config.BindEnvAndSetDefault("apm_config.instrumentation.enabled_namespaces", []string{}, "DD_APM_INSTRUMENTATION_ENABLED_NAMESPACES")
Expand Down
25 changes: 16 additions & 9 deletions pkg/config/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1502,19 +1502,26 @@ api_key:

## @param peer_service_aggregation - bool - default: false
## @env DD_APM_PEER_SERVICE_AGGREGATION - bool - default: false
## [BETA] Enables `peer.service` aggregation in the agent. If disabled, aggregated trace stats will not include `peer.service` as a dimension.
## For the best experience with `peer.service`, it is recommended to also enable `compute_stats_by_span_kind`.
## If enabling both causes the Agent to consume too many resources, try disabling `compute_stats_by_span_kind` first.
## If the overhead remains high, it will be due to a high cardinality of `peer.service` values from the traces. You may need to check your instrumentation.
## NOTE: If you are using an OTel tracer it's best to have both enabled because client/producer spans with a `peer.service` value
## may not be marked by the Agent as top-level spans.
## DEPRECATED - please use `peer_tags_aggregation` instead.
# peer_service_aggregation: false

## @param peer_tags_aggregation - bool - default: false
## @env DD_APM_PEER_TAGS_AGGREGATION - bool - default: false
## [BETA] Enables aggregation of peer related tags (e.g., `peer.service`, `db.instance`, etc.) in the Agent.
## If disabled, aggregated trace stats will not include these tags as dimensions on trace metrics.
## For the best experience with peer tags, Datadog also recommends enabling `compute_stats_by_span_kind`.
## If you are using an OTel tracer, it's best to have both enabled because client/producer spans with relevant peer tags
## may not be marked by the Agent as top-level spans.
## If enabling both causes the Agent to consume too many resources, try disabling `compute_stats_by_span_kind` first.
## A high cardinality of peer tags or APM resources can also contribute to higher CPU and memory consumption.
## You can check for the cardinality of these fields by making trace search queries in the Datadog UI.
## The default list of peer tags can be found in pkg/trace/stats/concentrator.go.
# peer_tags_aggregation: false

## @param peer_tags - list of strings - optional
## @env DD_APM_PEER_TAGS - list of strings - optional
## [BETA] Enables additional tags related to peer.service. Will not be used unless peer_service_aggregation is also set.
## NOTE: This option is recommended to get the most granular tagging alongside `peer.service`, but it will
## also increase the computational overhead of aggregation. This list can be omitted if that cost is too high for your agent.
## [BETA] Optional list of supplementary peer tags that go beyond the defaults. The Datadog backend validates all tags
## and will drop ones that are unapproved.
# peer_tags: []

## @param features - list of strings - optional
Expand Down
29 changes: 29 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,26 @@ apm_config:
require.False(t, testConfig.GetBool("apm_config.peer_service_aggregation"))
}

func TestEnablePeerTagsAggregationYAML(t *testing.T) {
datadogYaml := `
apm_config:
peer_tags_aggregation: true
`
testConfig := SetupConfFromYAML(datadogYaml)
err := setupFipsEndpoints(testConfig)
require.NoError(t, err)
require.True(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))

datadogYaml = `
apm_config:
peer_tags_aggregation: false
`
testConfig = SetupConfFromYAML(datadogYaml)
err = setupFipsEndpoints(testConfig)
require.NoError(t, err)
require.False(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
}

func TestEnablePeerServiceStatsAggregationEnv(t *testing.T) {
t.Setenv("DD_APM_PEER_SERVICE_AGGREGATION", "true")
testConfig := SetupConfFromYAML("")
Expand All @@ -879,6 +899,15 @@ func TestEnablePeerServiceStatsAggregationEnv(t *testing.T) {
require.False(t, testConfig.GetBool("apm_config.peer_service_aggregation"))
}

func TestEnablePeerTagsAggregationEnv(t *testing.T) {
t.Setenv("DD_APM_PEER_TAGS_AGGREGATION", "true")
testConfig := SetupConfFromYAML("")
require.True(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
t.Setenv("DD_APM_PEER_TAGS_AGGREGATION", "false")
testConfig = SetupConfFromYAML("")
require.False(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
}

func TestEnableStatsComputationBySpanKindYAML(t *testing.T) {
datadogYaml := `
apm_config:
Expand Down
1 change: 1 addition & 0 deletions pkg/flare/envvars.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ var allowedEnvvarNames = []string{
"DD_APM_REMOTE_TAGGER",
"DD_APM_PEER_SERVICE_AGGREGATION",
"DD_APM_COMPUTE_STATS_BY_SPAN_KIND",
"DD_APM_PEER_TAGS_AGGREGATION",
"DD_APM_PEER_TAGS",
"DD_APM_MAX_CATALOG_SERVICES",
"DD_APM_RECEIVER_TIMEOUT",
Expand Down
9 changes: 6 additions & 3 deletions pkg/proto/datadog/trace/stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ message StatsPayload {
repeated ClientStatsPayload stats = 3;
string agentVersion = 4;
bool clientComputed = 5;
// splitPayload indicates if the payload is actually one of several payloads split out from a larger payload.
// This field can be used in the backend to signal if re-aggregation is necessary.
bool splitPayload = 6;
}

// ClientStatsPayload is the first layer of span stats aggregation. It is also
Expand Down Expand Up @@ -70,9 +73,9 @@ message ClientGroupedStats {
bytes errorSummary = 11; // ddsketch summary of error spans latencies encoded in protobuf
bool synthetics = 12; // set to true on spans generated by synthetics traffic
uint64 topLevelHits = 13; // count of top level spans aggregated in the groupedstats
string peer_service = 14; // name of the remote service that the `service` communicated with
reserved 14; // peer_service has been deprecated
string span_kind = 15; // value of the span.kind tag on the span
// peer_tags are supplementary tags that further describe a peer_service
// E.g., `aws.s3.bucket` gives a specific bucket for `peer_service = aws-s3`
// peer_tags are supplementary tags that further describe a peer entity
// E.g., `grpc.target` to describe the name of a gRPC peer, or `db.hostname` to describe the name of peer DB
repeated string peer_tags = 16;
}
Loading

0 comments on commit e43ee12

Please sign in to comment.