Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[apm] standardize peer tag aggregation #20550

Merged
merged 33 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4d6faa4
add new config flag for peer tag aggregation, add peer tag defaults
jdgumz Oct 27, 2023
bfb13e6
add peer_tags_aggregation config
jdgumz Oct 27, 2023
d25c225
update aggregation to work with just peer tags, remove dependency on …
jdgumz Oct 27, 2023
e570487
add tests for instantiating concentrator with different peer tag rela…
jdgumz Oct 27, 2023
9651de3
fix tests for client stats aggregator
jdgumz Oct 30, 2023
92adb54
correct inconsistent naming for peer tags aggregation variable
jdgumz Oct 30, 2023
aa88f6e
fix configuration of peer tags aggregation flag
jdgumz Oct 30, 2023
c484fa6
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
400cb60
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
1e59bed
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
12b1cb2
Update pkg/config/config_template.yaml
jdgumz Oct 31, 2023
1597550
add _dd.base_service to default peer tags
jdgumz Nov 1, 2023
b253320
Merge branch 'apm/standardize-peer-tag-aggregation' of github.com:Dat…
jdgumz Nov 1, 2023
47c84b3
Merge branch 'main' into apm/standardize-peer-tag-aggregation
jdgumz Nov 1, 2023
fc01656
add split_payload field for client stats payload protobuf
jdgumz Nov 1, 2023
1f4037f
Revert "add split_payload field for client stats payload protobuf"
jdgumz Nov 1, 2023
ece387c
add splitPayload field to StatsPayload struct
jdgumz Nov 2, 2023
4933933
mark StatsPayloads with splitPayload field
jdgumz Nov 2, 2023
5d2a44b
add test cases for instantiation of client stats aggregator
jdgumz Nov 2, 2023
ce3e6af
make peer_service a reserved field
jdgumz Nov 2, 2023
c4a643a
remove old PeerService field references
jdgumz Nov 2, 2023
6ffb5db
remove PeerService field references in tests
jdgumz Nov 2, 2023
ee35a7d
add new benchmark
jdgumz Nov 2, 2023
9e2c25f
revamp benchmark to account for peer tags
jdgumz Nov 2, 2023
d7cf0bf
correct logic for marking payloads as split
jdgumz Nov 3, 2023
93b6441
appease linter
jdgumz Nov 3, 2023
a34803e
add db.system to defaults for peer tags
jdgumz Nov 3, 2023
0bcc362
allow either peer service aggregation or peer tags aggregation flag t…
jdgumz Nov 4, 2023
368559d
ensure that we only retrieve peer tag when its value is non-empty
jdgumz Nov 6, 2023
236db04
update default peer tags
jdgumz Nov 7, 2023
c34a69d
additional updates for peer tags list
jdgumz Nov 7, 2023
4f4546e
revise documentation for flags related to peer tags aggregation
jdgumz Nov 7, 2023
26bc7ff
add release notes
jdgumz Nov 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions comp/trace/config/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ func applyDatadogConfig(c *config.AgentConfig, core corecompcfg.Component) error
c.ConnectionLimit = core.GetInt("apm_config.connection_limit")
}
c.PeerServiceAggregation = core.GetBool("apm_config.peer_service_aggregation")
if c.PeerServiceAggregation {
log.Warn("`apm_config.peer_service_aggregation` is deprecated, please use `apm_config.peer_tags_aggregation` instead")
}
c.PeerTagsAggregation = core.GetBool("apm_config.peer_tags_aggregation")
c.ComputeStatsBySpanKind = core.GetBool("apm_config.compute_stats_by_span_kind")
if core.IsSet("apm_config.peer_tags") {
c.PeerTags = core.GetStringSlice("apm_config.peer_tags")
Expand Down
1 change: 1 addition & 0 deletions pkg/config/apm.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ func setupAPM(config Config) {
config.BindEnvAndSetDefault("apm_config.windows_pipe_security_descriptor", "D:AI(A;;GA;;;WD)", "DD_APM_WINDOWS_PIPE_SECURITY_DESCRIPTOR") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.remote_tagger", true, "DD_APM_REMOTE_TAGGER") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.peer_service_aggregation", false, "DD_APM_PEER_SERVICE_AGGREGATION") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.peer_tags_aggregation", false, "DD_APM_PEER_TAGS_AGGREGATION") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.compute_stats_by_span_kind", false, "DD_APM_COMPUTE_STATS_BY_SPAN_KIND") //nolint:errcheck
config.BindEnvAndSetDefault("apm_config.instrumentation.enabled", false, "DD_APM_INSTRUMENTATION_ENABLED")
config.BindEnvAndSetDefault("apm_config.instrumentation.enabled_namespaces", []string{}, "DD_APM_INSTRUMENTATION_ENABLED_NAMESPACES")
Expand Down
19 changes: 15 additions & 4 deletions pkg/config/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1502,19 +1502,30 @@ api_key:

## @param peer_service_aggregation - bool - default: false
## @env DD_APM_PEER_SERVICE_AGGREGATION - bool - default: false
## [BETA] Enables `peer.service` aggregation in the agent. If disabled, aggregated trace stats will not include `peer.service` as a dimension.
## DEPRECATED - please use `peer_tags_aggregation` instead.
## Enables `peer.service` aggregation in the Agent. If disabled, aggregated trace stats will not include `peer.service` as a dimension.
## For the best experience with `peer.service`, it is recommended to also enable `compute_stats_by_span_kind`.
## If enabling both causes the Agent to consume too many resources, try disabling `compute_stats_by_span_kind` first.
## If the overhead remains high, it will be due to a high cardinality of `peer.service` values from the traces. You may need to check your instrumentation.
## NOTE: If you are using an OTel tracer it's best to have both enabled because client/producer spans with a `peer.service` value
## may not be marked by the Agent as top-level spans.
# peer_service_aggregation: false

## @param peer_tags_aggregation - bool - default: false
## @env DD_APM_PEER_TAGS_AGGREGATION - bool - default: false
## [BETA] Enables aggregation of peer related tags (e.g., `peer.service`, `db.instance`, etc.) in the Agent.
## If disabled, aggregated trace stats will not include these tags as dimensions on trace metrics.
## For the best experience, Datadog also recommends enabling `compute_stats_by_span_kind`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Presumably we recommend enabling compute_stats_by_span_kind if peer_tags_aggregation is enabled?

Do we recommend enabling peer_tags_aggregation?

Will people reading this know what it means?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll rephrase. The idea here is that if you're using peer_tags_aggregation, you will likely also want the span kind flag enabled too.

## If enabling both causes the Agent to consume too many resources, try disabling `compute_stats_by_span_kind` first.
## If the overhead remains high, it will be due to a high cardinality of peer tags from the traces. You may need to check your instrumentation.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What should they be looking for in their instrumentation? Can we link them any documentation here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can provide some more guidance in this comment. We do not have clear documentation that would speak to this specific concern.

## NOTE: If you are using an OTel tracer, it's best to have both enabled because client/producer spans with a `peer.service` value
## may not be marked by the Agent as top-level spans.
# peer_tags_aggregation: false

## @param peer_tags - list of strings - optional
## @env DD_APM_PEER_TAGS - list of strings - optional
## [BETA] Enables additional tags related to peer.service. Will not be used unless peer_service_aggregation is also set.
## NOTE: This option is recommended to get the most granular tagging alongside `peer.service`, but it will
## also increase the computational overhead of aggregation. This list can be omitted if that cost is too high for your agent.
## [BETA] Optional list of supplementary peer tags that go beyond the defaults. The Datadog backend validates all tags
## and will drop ones that are unapproved.
# peer_tags: []

## @param features - list of strings - optional
Expand Down
29 changes: 29 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,26 @@ apm_config:
require.False(t, testConfig.GetBool("apm_config.peer_service_aggregation"))
}

func TestEnablePeerTagsAggregationYAML(t *testing.T) {
datadogYaml := `
apm_config:
peer_tags_aggregation: true
`
testConfig := SetupConfFromYAML(datadogYaml)
err := setupFipsEndpoints(testConfig)
require.NoError(t, err)
require.True(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))

datadogYaml = `
apm_config:
peer_tags_aggregation: false
`
testConfig = SetupConfFromYAML(datadogYaml)
err = setupFipsEndpoints(testConfig)
require.NoError(t, err)
require.False(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
}

func TestEnablePeerServiceStatsAggregationEnv(t *testing.T) {
t.Setenv("DD_APM_PEER_SERVICE_AGGREGATION", "true")
testConfig := SetupConfFromYAML("")
Expand All @@ -868,6 +888,15 @@ func TestEnablePeerServiceStatsAggregationEnv(t *testing.T) {
require.False(t, testConfig.GetBool("apm_config.peer_service_aggregation"))
}

func TestEnablePeerTagsAggregationEnv(t *testing.T) {
t.Setenv("DD_APM_PEER_TAGS_AGGREGATION", "true")
testConfig := SetupConfFromYAML("")
require.True(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
t.Setenv("DD_APM_PEER_TAGS_AGGREGATION", "false")
testConfig = SetupConfFromYAML("")
require.False(t, testConfig.GetBool("apm_config.peer_tags_aggregation"))
}

func TestEnableStatsComputationBySpanKindYAML(t *testing.T) {
datadogYaml := `
apm_config:
Expand Down
1 change: 1 addition & 0 deletions pkg/flare/envvars.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ var allowedEnvvarNames = []string{
"DD_APM_REMOTE_TAGGER",
"DD_APM_PEER_SERVICE_AGGREGATION",
"DD_APM_COMPUTE_STATS_BY_SPAN_KIND",
"DD_APM_PEER_TAGS_AGGREGATION",
"DD_APM_PEER_TAGS",
"DD_APM_MAX_CATALOG_SERVICES",
"DD_APM_RECEIVER_TIMEOUT",
Expand Down
9 changes: 6 additions & 3 deletions pkg/proto/datadog/trace/stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ message StatsPayload {
repeated ClientStatsPayload stats = 3;
string agentVersion = 4;
bool clientComputed = 5;
// splitPayload indicates if the payload is actually one of several payloads split out from a larger payload.
// This field can be used in the backend to signal if re-aggregation is necessary.
bool splitPayload = 6;
}

// ClientStatsPayload is the first layer of span stats aggregation. It is also
Expand Down Expand Up @@ -70,9 +73,9 @@ message ClientGroupedStats {
bytes errorSummary = 11; // ddsketch summary of error spans latencies encoded in protobuf
bool synthetics = 12; // set to true on spans generated by synthetics traffic
uint64 topLevelHits = 13; // count of top level spans aggregated in the groupedstats
string peer_service = 14; // name of the remote service that the `service` communicated with
reserved 14; // peer_service has been deprecated
string span_kind = 15; // value of the span.kind tag on the span
// peer_tags are supplementary tags that further describe a peer_service
// E.g., `aws.s3.bucket` gives a specific bucket for `peer_service = aws-s3`
// peer_tags are supplementary tags that further describe a peer entity
// E.g., `grpc.target` to describe the name of a gRPC peer, or `db.hostname` to describe the name of peer DB
repeated string peer_tags = 16;
}
Loading