Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[apm] standardize peer tag aggregation #20550

Merged
merged 33 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4d6faa4
add new config flag for peer tag aggregation, add peer tag defaults
jdgumz Oct 27, 2023
bfb13e6
add peer_tags_aggregation config
jdgumz Oct 27, 2023
d25c225
update aggregation to work with just peer tags, remove dependency on …
jdgumz Oct 27, 2023
e570487
add tests for instantiating concentrator with different peer tag rela…
jdgumz Oct 27, 2023
9651de3
fix tests for client stats aggregator
jdgumz Oct 30, 2023
92adb54
correct inconsistent naming for peer tags aggregation variable
jdgumz Oct 30, 2023
aa88f6e
fix configuration of peer tags aggregation flag
jdgumz Oct 30, 2023
c484fa6
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
400cb60
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
1e59bed
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
12b1cb2
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
1597550
add _dd.base_service to default peer tags
jdgumz Nov 1, 2023
b253320
Merge branch 'apm/standardize-peer-tag-aggregation' of github.com:Dat…
jdgumz Nov 1, 2023
47c84b3
Merge branch 'main' into apm/standardize-peer-tag-aggregation
jdgumz Nov 1, 2023
fc01656
add split_payload field for client stats payload protobuf
jdgumz Nov 1, 2023
1f4037f
Revert "add split_payload field for client stats payload protobuf"
jdgumz Nov 1, 2023
ece387c
add splitPayload field to StatsPayload struct
jdgumz Nov 2, 2023
4933933
mark StatsPayloads with splitPayload field
jdgumz Nov 2, 2023
5d2a44b
add test cases for instantiation of client stats aggregator
jdgumz Nov 2, 2023
ce3e6af
make peer_service a reserved field
jdgumz Nov 2, 2023
c4a643a
remove old PeerService field references
jdgumz Nov 2, 2023
6ffb5db
remove PeerService field references in tests
jdgumz Nov 2, 2023
ee35a7d
add new benchmark
jdgumz Nov 2, 2023
9e2c25f
revamp benchmark to account for peer tags
jdgumz Nov 2, 2023
d7cf0bf
correct logic for marking payloads as split
jdgumz Nov 3, 2023
93b6441
appease linter
jdgumz Nov 3, 2023
a34803e
add db.system to defaults for peer tags
jdgumz Nov 3, 2023
0bcc362
allow either peer service aggregation or peer tags aggregation flag t…
jdgumz Nov 4, 2023
368559d
ensure that we only retrieve peer tag when its value is non-empty
jdgumz Nov 6, 2023
236db04
update default peer tags
jdgumz Nov 7, 2023
c34a69d
additional updates for peer tags list
jdgumz Nov 7, 2023
4f4546e
revise documentation for flags related to peer tags aggregation
jdgumz Nov 7, 2023
26bc7ff
add release notes
jdgumz Nov 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions comp/trace/config/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ func applyDatadogConfig(c *config.AgentConfig, core corecompcfg.Component) error
c.ConnectionLimit = core.GetInt("apm_config.connection_limit")
}
c.PeerServiceAggregation = core.GetBool("apm_config.peer_service_aggregation")
if c.PeerServiceAggregation {
log.Warn("`apm_config.peer_service_aggregation` is deprecated, please use `apm_config.peer_tags_aggregation` instead")
}
c.PeerTagsAggregation = core.GetBool("apm_config.peer_tags_aggregation")
c.ComputeStatsBySpanKind = core.GetBool("apm_config.compute_stats_by_span_kind")
if core.IsSet("apm_config.peer_tags") {
c.PeerTags = core.GetStringSlice("apm_config.peer_tags")
Expand Down
1 change: 1 addition & 0 deletions pkg/config/apm.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ func setupAPM(config Config) {
config.BindEnvAndSetDefault("apm_config.windows_pipe_security_descriptor", "D:AI(A;;GA;;;WD)", "DD_APM_WINDOWS_PIPE_SECURITY_DESCRIPTOR") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.remote_tagger", true, "DD_APM_REMOTE_TAGGER") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.peer_service_aggregation", false, "DD_APM_PEER_SERVICE_AGGREGATION") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.peer_tags_aggregation", false, "DD_APM_PEER_TAGS_AGGREGATION") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.compute_stats_by_span_kind", false, "DD_APM_COMPUTE_STATS_BY_SPAN_KIND") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.instrumentation.enabled", false, "DD_APM_INSTRUMENTATION_ENABLED")
config.BindEnvAndSetDefault("apm_config.instrumentation.enabled_namespaces", []string{}, "DD_APM_INSTRUMENTATION_ENABLED_NAMESPACES")
Expand Down
25 changes: 16 additions & 9 deletions pkg/config/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1502,19 +1502,26 @@ api_key:

## @param peer_service_aggregation - bool - default: false
## @env DD_APM_PEER_SERVICE_AGGREGATION - bool - default: false
## [BETA] Enables `peer.service` aggregation in the agent. If disabled, aggregated trace stats will not include `peer.service` as a dimension.
## For the best experience with `peer.service`, it is recommended to also enable `compute_stats_by_span_kind`.
## If enabling both causes the Agent to consume too many resources, try disabling `compute_stats_by_span_kind` first.
## If the overhead remains high, it will be due to a high cardinality of `peer.service` values from the traces. You may need to check your instrumentation.
## NOTE: If you are using an OTel tracer it's best to have both enabled because client/producer spans with a `peer.service` value
## may not be marked by the Agent as top-level spans.
## DEPRECATED - please use `peer_tags_aggregation` instead.
# peer_service_aggregation: false

## @param peer_tags_aggregation - bool - default: false
## @env DD_APM_PEER_TAGS_AGGREGATION - bool - default: false
## [BETA] Enables aggregation of peer related tags (e.g., `peer.service`, `db.instance`, etc.) in the Agent.
## If disabled, aggregated trace stats will not include these tags as dimensions on trace metrics.
## For the best experience with peer tags, Datadog also recommends enabling `compute_stats_by_span_kind`.
## If you are using an OTel tracer, it's best to have both enabled because client/producer spans with relevant peer tags
## may not be marked by the Agent as top-level spans.
## If enabling both causes the Agent to consume too many resources, try disabling `compute_stats_by_span_kind` first.
## A high cardinality of peer tags or APM resources can also contribute to higher CPU and memory consumption.
## You can check for the cardinality of these fields by making trace search queries in the Datadog UI.
## The default list of peer tags can be found in pkg/trace/stats/concentrator.go.
# peer_tags_aggregation: false

## @param peer_tags - list of strings - optional
## @env DD_APM_PEER_TAGS - list of strings - optional
## [BETA] Enables additional tags related to peer.service. Will not be used unless peer_service_aggregation is also set.
## NOTE: This option is recommended to get the most granular tagging alongside `peer.service`, but it will
## also increase the computational overhead of aggregation. This list can be omitted if that cost is too high for your agent.
## [BETA] Optional list of supplementary peer tags that go beyond the defaults. The Datadog backend validates all tags
## and will drop ones that are unapproved.
# peer_tags: []

## @param features - list of strings - optional
Expand Down
29 changes: 29 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,26 @@ apm_config:
require.False(t, testConfig.GetBool("apm_config.peer_service_aggregation"))
}

func TestEnablePeerTagsAggregationYAML(t *testing.T) {
datadogYaml := `
apm_config:
peer_tags_aggregation: true
`
testConfig := SetupConfFromYAML(datadogYaml)
err := setupFipsEndpoints(testConfig)
require.NoError(t, err)
require.True(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))

datadogYaml = `
apm_config:
peer_tags_aggregation: false
`
testConfig = SetupConfFromYAML(datadogYaml)
err = setupFipsEndpoints(testConfig)
require.NoError(t, err)
require.False(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
}

func TestEnablePeerServiceStatsAggregationEnv(t *testing.T) {
t.Setenv("DD_APM_PEER_SERVICE_AGGREGATION", "true")
testConfig := SetupConfFromYAML("")
Expand All @@ -868,6 +888,15 @@ func TestEnablePeerServiceStatsAggregationEnv(t *testing.T) {
require.False(t, testConfig.GetBool("apm_config.peer_service_aggregation"))
}

func TestEnablePeerTagsAggregationEnv(t *testing.T) {
t.Setenv("DD_APM_PEER_TAGS_AGGREGATION", "true")
testConfig := SetupConfFromYAML("")
require.True(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
t.Setenv("DD_APM_PEER_TAGS_AGGREGATION", "false")
testConfig = SetupConfFromYAML("")
require.False(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
}

func TestEnableStatsComputationBySpanKindYAML(t *testing.T) {
datadogYaml := `
apm_config:
Expand Down
1 change: 1 addition & 0 deletions pkg/flare/envvars.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ var allowedEnvvarNames = []string{
"DD_APM_REMOTE_TAGGER",
"DD_APM_PEER_SERVICE_AGGREGATION",
"DD_APM_COMPUTE_STATS_BY_SPAN_KIND",
"DD_APM_PEER_TAGS_AGGREGATION",
"DD_APM_PEER_TAGS",
"DD_APM_MAX_CATALOG_SERVICES",
"DD_APM_RECEIVER_TIMEOUT",
Expand Down
9 changes: 6 additions & 3 deletions pkg/proto/datadog/trace/stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ message StatsPayload {
repeated ClientStatsPayload stats = 3;
string agentVersion = 4;
bool clientComputed = 5;
// splitPayload indicates if the payload is actually one of several payloads split out from a larger payload.
// This field can be used in the backend to signal if re-aggregation is necessary.
bool splitPayload = 6;
}

// ClientStatsPayload is the first layer of span stats aggregation. It is also
Expand Down Expand Up @@ -70,9 +73,9 @@ message ClientGroupedStats {
bytes errorSummary = 11; // ddsketch summary of error spans latencies encoded in protobuf
bool synthetics = 12; // set to true on spans generated by synthetics traffic
uint64 topLevelHits = 13; // count of top level spans aggregated in the groupedstats
string peer_service = 14; // name of the remote service that the `service` communicated with
reserved 14; // peer_service has been deprecated
string span_kind = 15; // value of the span.kind tag on the span
// peer_tags are supplementary tags that further describe a peer_service
// E.g., `aws.s3.bucket` gives a specific bucket for `peer_service = aws-s3`
// peer_tags are supplementary tags that further describe a peer entity
// E.g., `grpc.target` to describe the name of a gRPC peer, or `db.hostname` to describe the name of peer DB
repeated string peer_tags = 16;
}
Loading