grafana · grcevski · Apr 18, 2024 · Apr 12, 2024 · Apr 12, 2024 · Apr 15, 2024
@@ -199,7 +199,7 @@ This section allows specifying different selection criteria for different servic
 as well as overriding some of their metadata, such as their reported name or
 namespace.
 
-For more details about this section, please go to the [discovery services section](#discovery-services-section)
+For more details about this section, go to the [discovery services section](#discovery-services-section)
 of this document.
 
 | YAML                       | Environment variable                          | Type    | Default |
@@ -447,8 +447,8 @@ a 'Traceparent' header value, it will use the provided 'trace id' to create its
 This option does not have an effect on Go applications, where the 'Traceparent' field is always
 processed, without additional tracking of the request headers.
 
-Enabling this option may increase Beyla's performance overhead in high request volume scenarios.
-Please note that this option is only useful when generating Beyla traces, it does not affect
+Enabling this option may increase the performance overhead in high request volume scenarios.
+This option is only useful when generating Beyla traces; it does not affect
 generation of Beyla metrics.
 
 ## Configuration of metrics and traces attributes
@@ -522,7 +522,7 @@ attributes:
 ```
 
 It is IMPORTANT to consider that enabling this feature requires a previous step of
-providing some extra permissions to the Beyla Pod. Please check the
+providing some extra permissions to the Beyla Pod. Consult the
 ["Configuring Kubernetes metadata decoration section" in the "Running Beyla in Kubernetes"]({{< relref "../setup/kubernetes.md" >}}) page.
 
 | YAML     | Environment variable                      | Type    | Default |
@@ -643,7 +643,7 @@ Possible values for the `ignore_mode` property are:
 
 Selectively ignoring only certain types of events might be useful in certain scenarios. For example, you may want to
 know the performance metrics of your health check API, but you wouldn't want the overhead of those trace records in
-your target traces database. In this this example scenario, you would set the `ignore_mode` property to `traces`, such
+your target traces database. In this example scenario, you would set the `ignore_mode` property to `traces`, such
 that only traces matching the `ignored_patterns` will be discarded, while metrics will still be recorded.
 
 | YAML        | Environment variable | Type   | Default    |
@@ -688,7 +688,7 @@ document/d/*/edit
 ## OTEL metrics exporter
 
 > ℹ️ If you plan to use Beyla to send metrics to Grafana Cloud,
-> please check the [Grafana Cloud OTEL exporter for metrics and traces](#using-the-grafana-cloud-otel-endpoint-to-ingest-metrics-and-traces)
+> consult the [Grafana Cloud OTEL exporter for metrics and traces](#using-the-grafana-cloud-otel-endpoint-to-ingest-metrics-and-traces)
 > section for easier configuration.
 
 YAML section `otel_metrics_export`.
@@ -785,7 +785,13 @@ of Beyla: application-level metrics or network metrics.
   process matching the entries in the `discovery` section.
 - If the list contains `application_span`, the Beyla OpenTelemetry exporter exports application-level trace span metrics;
   but only if there is defined an OpenTelemetry endpoint, and Beyla was able to discover any
-  process matching the entries in the `discovery` section.
+  process matching the entries in the `discovery` section. 
+- If the list contains `application_service_graph`, the Beyla OpenTelemetry exporter exports application-level service graph metrics;
+  but only if there is defined an OpenTelemetry endpoint, and Beyla was able to discover any
+  process matching the entries in the `discovery` section.
+  For the best experience when generating service graph metrics, use DNS for service discovery and make sure the DNS names match
+  the OpenTelemetry service names used in Beyla. In Kubernetes environments, the OpenTelemetry service name set by the service name
+  discovery is the best choice for service graph metrics.
 - If the list contains `network`, the Beyla OpenTelemetry exporter exports network-level
   metrics; but only if there is defined an OpenTelemetry endpoint and the
   [network metrics are enabled]({{< relref "../network" >}}).
@@ -867,7 +873,7 @@ for more information.
 ## OTEL traces exporter
 
 > ℹ️ If you plan to use Beyla to send metrics to Grafana Cloud,
-> please check the [Grafana Cloud OTEL exporter for metrics and traces](#using-the-grafana-cloud-otel-endpoint-to-ingest-metrics-and-traces)
+> consult the [Grafana Cloud OTEL exporter for metrics and traces](#using-the-grafana-cloud-otel-endpoint-to-ingest-metrics-and-traces)
 > section for easier configuration.
 
 YAML section `otel_traces_export`.
@@ -1096,6 +1102,12 @@ of Beyla: application-level metrics or network metrics.
 - If the list contains `application_span`, the Beyla Prometheus exporter exports application-level metrics in traces span metrics format;
   but only if the Prometheus `port` property is defined, and Beyla was able to discover any
   process matching the entries in the `discovery` section.
+- If the list contains `application_service_graph`, the Beyla Prometheus exporter exports application-level service graph metrics;
+  but only if the Prometheus `port` property is defined, and Beyla was able to discover any
+  process matching the entries in the `discovery` section.
+  For the best experience when generating service graph metrics, use DNS for service discovery and make sure the DNS names match
+  the OpenTelemetry service names used in Beyla. In Kubernetes environments, the OpenTelemetry service name set by the service name
+  discovery is the best choice for service graph metrics.
 - If the list contains `network`, the Beyla Prometheus exporter exports network-level
   metrics; but only if the Prometheus `port` property is defined and the
   [network metrics are enabled]({{< relref "../network" >}}).

@@ -49,6 +49,10 @@ var DefaultConfig = Config{
 			Submit: []string{"traces"},
 		},
 	},
+	NameResolver: &transform.NameResolverConfig{
+		CacheLen: 1024,
+		CacheTTL: 5 * time.Minute,
+	},
 	Metrics: otel.MetricsConfig{
 		Protocol:             otel.ProtocolUnset,
 		MetricsProtocol:      otel.ProtocolUnset,
@@ -106,11 +110,12 @@ type Config struct {
 
 	Attributes Attributes `yaml:"attributes"`
 	// Routes is an optional node. If not set, data will be directly forwarded to exporters.
-	Routes     *transform.RoutesConfig `yaml:"routes"`
-	Metrics    otel.MetricsConfig      `yaml:"otel_metrics_export"`
-	Traces     otel.TracesConfig       `yaml:"otel_traces_export"`
-	Prometheus prom.PrometheusConfig   `yaml:"prometheus_export"`
-	Printer    debug.PrintEnabled      `yaml:"print_traces" env:"BEYLA_PRINT_TRACES"`
+	Routes       *transform.RoutesConfig       `yaml:"routes"`
+	NameResolver *transform.NameResolverConfig `yaml:"name_resolver"`
+	Metrics      otel.MetricsConfig            `yaml:"otel_metrics_export"`
+	Traces       otel.TracesConfig             `yaml:"otel_traces_export"`
+	Prometheus   prom.PrometheusConfig         `yaml:"prometheus_export"`
+	Printer      debug.PrintEnabled            `yaml:"print_traces" env:"BEYLA_PRINT_TRACES"`
 
 	// Exec allows selecting the instrumented executable whose complete path contains the Exec value.
 	Exec services.RegexpAttr `yaml:"executable_name" env:"BEYLA_EXECUTABLE_NAME"`

@@ -149,6 +149,10 @@ network:
 			},
 		},
 		Routes: &transform.RoutesConfig{},
+		NameResolver: &transform.NameResolverConfig{
+			CacheLen: 1024,
+			CacheTTL: 5 * time.Minute,
+		},
 	}, cfg)
 }
 

@@ -156,7 +156,7 @@ var defaultNetworkConfig = NetworkConfig{
 
 func (nc *NetworkConfig) Validate(isKubeEnabled bool) error {
 	if len(nc.AllowedAttributes) == 0 {
-		return errors.New("you must define some attributes in the allowed_attributes section. Please ceck documentation")
+		return errors.New("you must define some attributes in the allowed_attributes section. Please check documentation")
 	}
 	if isKubeEnabled {
 		return nil

@@ -38,8 +38,8 @@ func printFunc() (pipe.FinalFunc[[]request.Span], error) {
 					spans[i].Status,
 					spans[i].Method,
 					spans[i].Path,
-					spans[i].Peer,
-					spans[i].Host,
+					spans[i].Peer+" as "+spans[i].PeerName,
+					spans[i].Host+" as "+spans[i].HostName,
 					spans[i].HostPort,
 					spans[i].ContentLength,
 					&spans[i].ServiceID,

@@ -262,6 +262,11 @@ const (
 	SourceKey                 = attribute.Key("source")
 	ServiceKey                = attribute.Key("service")
 	InstanceKey               = attribute.Key("instance")
+	ClientKey                 = attribute.Key("client")
+	ClientNamespaceKey        = attribute.Key("client_service_namespace")
+	ServerKey                 = attribute.Key("server")
+	ServerNamespaceKey        = attribute.Key("server_service_namespace")
+	ConnectionTypeKey         = attribute.Key("connection_type")
 )
 
 func HTTPRequestMethod(val string) attribute.KeyValue {
@@ -327,3 +332,23 @@ func StatusCodeMetric(val int) attribute.KeyValue {
 func ServiceInstanceMetric(val string) attribute.KeyValue {
 	return InstanceKey.String(val)
 }
+
+// ClientMetric returns an attribute with key ClientKey ("client") and the
+// string value val.
+func ClientMetric(val string) attribute.KeyValue {
+	return ClientKey.String(val)
+}
+
+// ClientNamespaceMetric returns an attribute with key ClientNamespaceKey
+// ("client_service_namespace") and the string value val.
+func ClientNamespaceMetric(val string) attribute.KeyValue {
+	return ClientNamespaceKey.String(val)
+}
+
+// ServerMetric returns an attribute with key ServerKey ("server") and the
+// string value val.
+func ServerMetric(val string) attribute.KeyValue {
+	return ServerKey.String(val)
+}
+
+// ServerNamespaceMetric returns an attribute with key ServerNamespaceKey
+// ("server_service_namespace") and the string value val.
+func ServerNamespaceMetric(val string) attribute.KeyValue {
+	return ServerNamespaceKey.String(val)
+}
+
+// ConnectionTypeMetric returns an attribute with key ConnectionTypeKey
+// ("connection_type") and the string value val.
+func ConnectionTypeMetric(val string) attribute.KeyValue {
+	return ConnectionTypeKey.String(val)
+}
@@ -12,6 +12,7 @@ import (
 
 	"github.com/mariomac/pipes/pipe"
 	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
 	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
 	instrument "go.opentelemetry.io/otel/metric"
@@ -41,6 +42,10 @@ const (
 	SpanMetricsCalls      = "traces_spanmetrics_calls_total"
 	SpanMetricsSizes      = "traces_spanmetrics_size_total"
 	TracesTargetInfo      = "traces_target_info"
+	ServiceGraphClient    = "traces_service_graph_request_client"
+	ServiceGraphServer    = "traces_service_graph_request_server"
+	ServiceGraphFailed    = "traces_service_graph_request_failed_total"
+	ServiceGraphTotal     = "traces_service_graph_request_total"
 
 	UsualPortGRPC = "4317"
 	UsualPortHTTP = "4318"
@@ -51,6 +56,7 @@ const (
 	FeatureNetwork     = "network"
 	FeatureApplication = "application"
 	FeatureSpan        = "application_span"
+	FeatureGraph       = "application_service_graph"
 )
 
 type MetricsConfig struct {
@@ -132,12 +138,16 @@ func (m MetricsConfig) SpanMetricsEnabled() bool {
 	return slices.Contains(m.Features, FeatureSpan)
 }
 
+// ServiceGraphMetricsEnabled reports whether the configured Features list
+// contains FeatureGraph ("application_service_graph").
+func (m MetricsConfig) ServiceGraphMetricsEnabled() bool {
+	return slices.Contains(m.Features, FeatureGraph)
+}
+
 func (m MetricsConfig) OTelMetricsEnabled() bool {
 	return slices.Contains(m.Features, FeatureApplication)
 }
 
 func (m MetricsConfig) Enabled() bool {
-	return m.EndpointEnabled() && (m.OTelMetricsEnabled() || m.SpanMetricsEnabled())
+	return m.EndpointEnabled() && (m.OTelMetricsEnabled() || m.SpanMetricsEnabled() || m.ServiceGraphMetricsEnabled())
 }
 
 // MetricsReporter implements the graph node that receives request.Span
@@ -167,6 +177,10 @@ type Metrics struct {
 	spanMetricsCallsTotal instrument.Int64Counter
 	spanMetricsSizeTotal  instrument.Float64Counter
 	tracesTargetInfo      instrument.Int64UpDownCounter
+	serviceGraphClient    instrument.Float64Histogram
+	serviceGraphServer    instrument.Float64Histogram
+	serviceGraphFailed    instrument.Int64Counter
+	serviceGraphTotal     instrument.Int64Counter
 }
 
 func ReportMetrics(
@@ -247,6 +261,19 @@ func (mr *MetricsReporter) spanMetricOptions(mlog *slog.Logger) []metric.Option
 	}
 }
 
+// graphMetricOptions returns the metric options for the service graph
+// histograms. It returns an empty slice when service graph metrics are
+// disabled; otherwise it returns one view per histogram (client and server),
+// both built from mr.cfg.Buckets.DurationHistogram and the aggregation mode
+// reported by isExponentialAggregation.
+func (mr *MetricsReporter) graphMetricOptions(mlog *slog.Logger) []metric.Option {
+	if !mr.cfg.ServiceGraphMetricsEnabled() {
+		return []metric.Option{}
+	}
+
+	useExponentialHistograms := isExponentialAggregation(mr.cfg, mlog)
+
+	return []metric.Option{
+		metric.WithView(otelHistogramConfig(ServiceGraphClient, mr.cfg.Buckets.DurationHistogram, useExponentialHistograms)),
+		metric.WithView(otelHistogramConfig(ServiceGraphServer, mr.cfg.Buckets.DurationHistogram, useExponentialHistograms)),
+	}
+}
+
 func (mr *MetricsReporter) setupOtelMeters(m *Metrics, meter instrument.Meter) error {
 	if !mr.cfg.OTelMetricsEnabled() {
 		return nil
@@ -315,6 +342,36 @@ func (mr *MetricsReporter) setupSpanMeters(m *Metrics, meter instrument.Meter) e
 	return nil
 }
 
+// setupGraphMeters creates the service graph instruments on m using meter:
+// two duration histograms (client and server, unit "s") and two counters
+// (failed and total requests). It does nothing and returns nil when service
+// graph metrics are not enabled, and returns a wrapped error as soon as any
+// instrument creation fails.
+//
+// NOTE(review): this function does not assign m.tracesTargetInfo. Confirm that
+// field is initialized elsewhere before any caller uses it when only the
+// service graph feature is enabled.
+func (mr *MetricsReporter) setupGraphMeters(m *Metrics, meter instrument.Meter) error {
+	if !mr.cfg.ServiceGraphMetricsEnabled() {
+		return nil
+	}
+
+	var err error
+
+	m.serviceGraphClient, err = meter.Float64Histogram(ServiceGraphClient, instrument.WithUnit("s"))
+	if err != nil {
+		return fmt.Errorf("creating service graph client histogram: %w", err)
+	}
+
+	m.serviceGraphServer, err = meter.Float64Histogram(ServiceGraphServer, instrument.WithUnit("s"))
+	if err != nil {
+		return fmt.Errorf("creating service graph server histogram: %w", err)
+	}
+
+	m.serviceGraphFailed, err = meter.Int64Counter(ServiceGraphFailed)
+	if err != nil {
+		return fmt.Errorf("creating service graph failed total: %w", err)
+	}
+
+	m.serviceGraphTotal, err = meter.Int64Counter(ServiceGraphTotal)
+	if err != nil {
+		return fmt.Errorf("creating service graph total: %w", err)
+	}
+
+	return nil
+}
+
 func (mr *MetricsReporter) newMetricSet(service svc.ID) (*Metrics, error) {
 	mlog := mlog().With("service", service)
 	mlog.Debug("creating new Metrics reporter")
@@ -328,6 +385,7 @@ func (mr *MetricsReporter) newMetricSet(service svc.ID) (*Metrics, error) {
 
 	opts = append(opts, mr.otelMetricOptions(mlog)...)
 	opts = append(opts, mr.spanMetricOptions(mlog)...)
+	opts = append(opts, mr.graphMetricOptions(mlog)...)
 
 	m := Metrics{
 		ctx:     mr.ctx,
@@ -357,6 +415,15 @@ func (mr *MetricsReporter) newMetricSet(service svc.ID) (*Metrics, error) {
 		m.tracesTargetInfo.Add(mr.ctx, 1, attrOpt)
 	}
 
+	if mr.cfg.ServiceGraphMetricsEnabled() {
+		err = mr.setupGraphMeters(&m, meter)
+		if err != nil {
+			return nil, err
+		}
+		attrOpt := instrument.WithAttributeSet(mr.metricResourceAttributes(service))
+		m.tracesTargetInfo.Add(mr.ctx, 1, attrOpt)
+	}
+
 	return &m, nil
 }
 
@@ -573,6 +640,30 @@ func (mr *MetricsReporter) spanMetricAttributes(span *request.Span) attribute.Se
 	return attribute.NewSet(attrs...)
 }
 
+// serviceGraphAttributes builds the attribute set attached to the service
+// graph metrics recorded for span. In both branches the client name comes from
+// span.PeerName and the server name from span.HostName; only the assignment of
+// the two namespaces depends on whether span is a client span. The connection
+// type is always "virtual_node" and the source is always "beyla".
+func (mr *MetricsReporter) serviceGraphAttributes(span *request.Span) attribute.Set {
+	var attrs []attribute.KeyValue
+	if span.IsClientSpan() {
+		// Client span: span.ServiceID.Namespace is reported as the client
+		// namespace and span.OtherNamespace as the server namespace.
+		attrs = []attribute.KeyValue{
+			ClientMetric(span.PeerName),
+			ClientNamespaceMetric(span.ServiceID.Namespace),
+			ServerMetric(span.HostName),
+			ServerNamespaceMetric(span.OtherNamespace),
+			ConnectionTypeMetric("virtual_node"),
+			SourceMetric("beyla"),
+		}
+	} else {
+		// Any other span: the namespaces are swapped relative to the client
+		// case, so span.ServiceID.Namespace is reported as the server namespace.
+		attrs = []attribute.KeyValue{
+			ClientMetric(span.PeerName),
+			ClientNamespaceMetric(span.OtherNamespace),
+			ServerMetric(span.HostName),
+			ServerNamespaceMetric(span.ServiceID.Namespace),
+			ConnectionTypeMetric("virtual_node"),
+			SourceMetric("beyla"),
+		}
+	}
+	return attribute.NewSet(attrs...)
+}
+
 func (r *Metrics) record(span *request.Span, mr *MetricsReporter) {
 	t := span.Timings()
 	duration := t.End.Sub(t.RequestStart).Seconds()
@@ -602,6 +693,19 @@ func (r *Metrics) record(span *request.Span, mr *MetricsReporter) {
 		r.spanMetricsCallsTotal.Add(r.ctx, 1, attrOpt)
 		r.spanMetricsSizeTotal.Add(r.ctx, float64(span.ContentLength), attrOpt)
 	}
+
+	if mr.cfg.ServiceGraphMetricsEnabled() {
+		attrOpt := instrument.WithAttributeSet(mr.serviceGraphAttributes(span))
+		if span.IsClientSpan() {
+			r.serviceGraphClient.Record(r.ctx, duration, attrOpt)
+		} else {
+			r.serviceGraphServer.Record(r.ctx, duration, attrOpt)
+		}
+		r.serviceGraphTotal.Add(r.ctx, 1, attrOpt)
+		if SpanStatusCode(span) == codes.Error {
+			r.serviceGraphFailed.Add(r.ctx, 1, attrOpt)
+		}
+	}
 }
 
 func (mr *MetricsReporter) reportMetrics(input <-chan []request.Span) {