Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for directly generating span graphs #745

Merged
merged 22 commits into from
Apr 18, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add separate option as per review comments
grcevski committed Apr 17, 2024
commit 0f591bba8e8c94758cd349ed4efb5c168153919a
10 changes: 8 additions & 2 deletions docs/sources/configure/options.md
Original file line number Diff line number Diff line change
@@ -785,7 +785,10 @@ of Beyla: application-level metrics or network metrics.
process matching the entries in the `discovery` section.
- If the list contains `application_span`, the Beyla OpenTelemetry exporter exports application-level trace span metrics;
but only if an OpenTelemetry endpoint is defined and Beyla was able to discover any
process matching the entries in the `discovery` section. When this option is enabled, service graph metrics are also generated.
process matching the entries in the `discovery` section.
- If the list contains `application_service_graph`, the Beyla OpenTelemetry exporter exports application-level service graph metrics;
but only if an OpenTelemetry endpoint is defined and Beyla was able to discover any
process matching the entries in the `discovery` section.
For the best experience when generating service graph metrics, use DNS for service discovery and make sure the DNS names match
the OpenTelemetry service names used in Beyla. In Kubernetes environments, the OpenTelemetry service name set by the service name
discovery is the best choice for service graph metrics.
@@ -1098,7 +1101,10 @@ of Beyla: application-level metrics or network metrics.
process matching the entries in the `discovery` section.
- If the list contains `application_span`, the Beyla Prometheus exporter exports application-level metrics in traces span metrics format;
but only if the Prometheus `port` property is defined, and Beyla was able to discover any
process matching the entries in the `discovery` section. When this option is enabled, service graph metrics are also generated.
process matching the entries in the `discovery` section.
- If the list contains `application_service_graph`, the Beyla Prometheus exporter exports application-level service graph metrics;
but only if the Prometheus `port` property is defined, and Beyla was able to discover any
process matching the entries in the `discovery` section.
For the best experience when generating service graph metrics, use DNS for service discovery and make sure the DNS names match
the OpenTelemetry service names used in Beyla. In Kubernetes environments, the OpenTelemetry service name set by the service name
discovery is the best choice for service graph metrics.
42 changes: 40 additions & 2 deletions pkg/internal/export/otel/metrics.go
Original file line number Diff line number Diff line change
@@ -56,6 +56,7 @@ const (
FeatureNetwork = "network"
FeatureApplication = "application"
FeatureSpan = "application_span"
FeatureGraph = "application_service_graph"
)

type MetricsConfig struct {
@@ -132,12 +133,16 @@ func (m MetricsConfig) SpanMetricsEnabled() bool {
return slices.Contains(m.Features, FeatureSpan)
}

// ServiceGraphMetricsEnabled reports whether the configured Features list
// includes the FeatureGraph entry.
func (m MetricsConfig) ServiceGraphMetricsEnabled() bool {
	position := slices.Index(m.Features, FeatureGraph)
	return position >= 0
}

// OTelMetricsEnabled reports whether the configured Features list includes
// the FeatureApplication entry.
func (m MetricsConfig) OTelMetricsEnabled() bool {
	position := slices.Index(m.Features, FeatureApplication)
	return position >= 0
}

// Enabled reports whether this metrics configuration is active. It returns
// true only when EndpointEnabled reports true and at least one of the
// application, span, or service graph metric features is selected.
func (m MetricsConfig) Enabled() bool {
	if !m.EndpointEnabled() {
		return false
	}
	// Any single metric family is enough to activate the configuration,
	// including a setup that selects only service graph metrics.
	return m.OTelMetricsEnabled() || m.SpanMetricsEnabled() || m.ServiceGraphMetricsEnabled()
}

// MetricsReporter implements the graph node that receives request.Span
@@ -248,6 +253,17 @@ func (mr *MetricsReporter) spanMetricOptions(mlog *slog.Logger) []metric.Option

return []metric.Option{
metric.WithView(otelHistogramConfig(SpanMetricsLatency, mr.cfg.Buckets.DurationHistogram, useExponentialHistograms)),
}
}

func (mr *MetricsReporter) graphMetricOptions(mlog *slog.Logger) []metric.Option {
if !mr.cfg.ServiceGraphMetricsEnabled() {
return []metric.Option{}
}

useExponentialHistograms := isExponentialAggregation(mr.cfg, mlog)

return []metric.Option{
metric.WithView(otelHistogramConfig(ServiceGraphClient, mr.cfg.Buckets.DurationHistogram, useExponentialHistograms)),
metric.WithView(otelHistogramConfig(ServiceGraphServer, mr.cfg.Buckets.DurationHistogram, useExponentialHistograms)),
}
@@ -318,6 +334,16 @@ func (mr *MetricsReporter) setupSpanMeters(m *Metrics, meter instrument.Meter) e
return fmt.Errorf("creating span metric traces target info: %w", err)
}

return nil
}

func (mr *MetricsReporter) setupGraphMeters(m *Metrics, meter instrument.Meter) error {
if !mr.cfg.ServiceGraphMetricsEnabled() {
return nil
}

var err error

m.serviceGraphClient, err = meter.Float64Histogram(ServiceGraphClient, instrument.WithUnit("s"))
if err != nil {
return fmt.Errorf("creating service graph client histogram: %w", err)
@@ -354,6 +380,7 @@ func (mr *MetricsReporter) newMetricSet(service svc.ID) (*Metrics, error) {

opts = append(opts, mr.otelMetricOptions(mlog)...)
opts = append(opts, mr.spanMetricOptions(mlog)...)
opts = append(opts, mr.graphMetricOptions(mlog)...)

m := Metrics{
ctx: mr.ctx,
@@ -383,6 +410,15 @@ func (mr *MetricsReporter) newMetricSet(service svc.ID) (*Metrics, error) {
m.tracesTargetInfo.Add(mr.ctx, 1, attrOpt)
}

if mr.cfg.ServiceGraphMetricsEnabled() {
err = mr.setupGraphMeters(&m, meter)
if err != nil {
return nil, err
}
attrOpt := instrument.WithAttributeSet(mr.metricResourceAttributes(service))
m.tracesTargetInfo.Add(mr.ctx, 1, attrOpt)
}

return &m, nil
}

@@ -651,8 +687,10 @@ func (r *Metrics) record(span *request.Span, mr *MetricsReporter) {
r.spanMetricsLatency.Record(r.ctx, duration, attrOpt)
r.spanMetricsCallsTotal.Add(r.ctx, 1, attrOpt)
r.spanMetricsSizeTotal.Add(r.ctx, float64(span.ContentLength), attrOpt)
}

attrOpt = instrument.WithAttributeSet(mr.serviceGraphAttributes(span))
if mr.cfg.ServiceGraphMetricsEnabled() {
attrOpt := instrument.WithAttributeSet(mr.serviceGraphAttributes(span))
if span.IsClientSpan() {
r.serviceGraphClient.Record(r.ctx, duration, attrOpt)
} else {
27 changes: 19 additions & 8 deletions pkg/internal/export/prom/prom.go
Original file line number Diff line number Diff line change
@@ -137,9 +137,13 @@ func (p PrometheusConfig) OTelMetricsEnabled() bool {
return slices.Contains(p.Features, otel.FeatureApplication)
}

// ServiceGraphMetricsEnabled reports whether the configured Features list
// includes the otel.FeatureGraph entry.
func (p PrometheusConfig) ServiceGraphMetricsEnabled() bool {
	position := slices.Index(p.Features, otel.FeatureGraph)
	return position >= 0
}

// Enabled reports whether this Prometheus configuration is active. It returns
// true only when either Port is non-zero or Registry is non-nil, and at least
// one of the application, span, or service graph metric features is selected.
// nolint:gocritic
func (p PrometheusConfig) Enabled() bool {
	if p.Port == 0 && p.Registry == nil {
		return false
	}
	// Any single metric family is enough to activate the configuration,
	// including a setup that selects only service graph metrics.
	return p.OTelMetricsEnabled() || p.SpanMetricsEnabled() || p.ServiceGraphMetricsEnabled()
}

type metricsReporter struct {
@@ -339,6 +343,11 @@ func newReporter(ctx context.Context, cfg *PrometheusConfig, ctxInfo *global.Con
mr.spanMetricsCallsTotal,
mr.spanMetricsSizeTotal,
mr.tracesTargetInfo,
)
}

if cfg.ServiceGraphMetricsEnabled() {
registeredMetrics = append(registeredMetrics,
mr.serviceGraphClient,
mr.serviceGraphServer,
mr.serviceGraphFailed,
@@ -396,6 +405,15 @@ func (r *metricsReporter) observe(span *request.Span) {
r.spanMetricsCallsTotal.WithLabelValues(lv...).Add(1)
r.spanMetricsSizeTotal.WithLabelValues(lv...).Add(float64(span.ContentLength))

_, ok := r.serviceCache.Get(span.ServiceID.UID)
if !ok {
r.serviceCache.Add(span.ServiceID.UID, span.ServiceID)
lv = r.labelValuesTargetInfo(span.ServiceID)
r.tracesTargetInfo.WithLabelValues(lv...).Add(1)
}
}

if r.cfg.ServiceGraphMetricsEnabled() {
lvg := r.labelValuesServiceGraph(span)
if span.IsClientSpan() {
r.serviceGraphClient.WithLabelValues(lvg...).Observe(duration)
@@ -406,13 +424,6 @@ func (r *metricsReporter) observe(span *request.Span) {
if otel.SpanStatusCode(span) == codes.Error {
r.serviceGraphFailed.WithLabelValues(lvg...).Add(1)
}

_, ok := r.serviceCache.Get(span.ServiceID.UID)
if !ok {
r.serviceCache.Add(span.ServiceID.UID, span.ServiceID)
lv = r.labelValuesTargetInfo(span.ServiceID)
r.tracesTargetInfo.WithLabelValues(lv...).Add(1)
}
}
}

Original file line number Diff line number Diff line change
@@ -115,5 +115,5 @@ spec:
- name: BEYLA_KUBE_METADATA_ENABLE
value: "autodetect"
- name: BEYLA_OTEL_METRIC_FEATURES
value: "application,application_span"
value: "application,application_span,application_service_graph"

Original file line number Diff line number Diff line change
@@ -125,7 +125,7 @@ spec:
- name: BEYLA_KUBE_METADATA_ENABLE
value: "autodetect"
- name: BEYLA_PROMETHEUS_FEATURES
value: "application,application_span"
value: "application,application_span,application_service_graph"
ports:
- containerPort: 8999
hostPort: 8999