diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go
index d7886b6f367c..cea57e824ab1 100644
--- a/api/v1alpha1/envoygateway_helpers.go
+++ b/api/v1alpha1/envoygateway_helpers.go
@@ -6,6 +6,8 @@
 package v1alpha1

 import (
+	"fmt"
+
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )

@@ -17,10 +19,12 @@ func DefaultEnvoyGateway() *EnvoyGateway {
 			APIVersion: GroupVersion.String(),
 		},
 		EnvoyGatewaySpec{
-			Gateway:  DefaultGateway(),
-			Provider: DefaultEnvoyGatewayProvider(),
-			Logging:  DefaultEnvoyGatewayLogging(),
-			Admin:    DefaultEnvoyGatewayAdmin(),
+			Gateway:   DefaultGateway(),
+			Provider:  DefaultEnvoyGatewayProvider(),
+			Logging:   DefaultEnvoyGatewayLogging(),
+			Admin:     DefaultEnvoyGatewayAdmin(),
+			Debug:     DefaultEnvoyGatewayDebug(),
+			Telemetry: DefaultEnvoyGatewayTelemetry(),
 		},
 	}
 }
@@ -45,6 +49,12 @@ func (e *EnvoyGateway) SetEnvoyGatewayDefaults() {
 	if e.Admin == nil {
 		e.Admin = DefaultEnvoyGatewayAdmin()
 	}
+	if e.Telemetry == nil {
+		e.Telemetry = DefaultEnvoyGatewayTelemetry()
+	}
+	if e.Debug == nil {
+		e.Debug = DefaultEnvoyGatewayDebug()
+	}
 }

 // GetEnvoyGatewayAdmin returns the EnvoyGatewayAdmin of EnvoyGateway or a default EnvoyGatewayAdmin if unspecified.
@@ -60,6 +70,22 @@ func (e *EnvoyGateway) GetEnvoyGatewayAdmin() *EnvoyGatewayAdmin {
 	return e.Admin
 }

+// GetEnvoyGatewayDebug returns the EnvoyGatewayDebug of EnvoyGateway or a default EnvoyGatewayDebug if unspecified.
+func (e *EnvoyGateway) GetEnvoyGatewayDebug() *EnvoyGatewayDebug {
+	if e.Debug != nil {
+		return e.Debug
+	}
+	e.Debug = DefaultEnvoyGatewayDebug()
+
+	return e.Debug
+}
+
+// GetEnvoyGatewayAdminAddress returns the EnvoyGateway Admin Address.
+func (e *EnvoyGateway) GetEnvoyGatewayAdminAddress() string {
+	address := e.GetEnvoyGatewayAdmin().Address
+	return fmt.Sprintf("%s:%d", address.Host, address.Port)
+}
+
 // DefaultGateway returns a new Gateway with default configuration parameters.
 func DefaultGateway() *Gateway {
 	return &Gateway{
@@ -76,6 +102,51 @@ func DefaultEnvoyGatewayLogging() *EnvoyGatewayLogging {
 	}
 }

+// GetEnvoyGatewayTelemetry returns the EnvoyGatewayTelemetry of EnvoyGateway or a default EnvoyGatewayTelemetry if unspecified.
+func (e *EnvoyGateway) GetEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry {
+	if e.Telemetry != nil {
+		if e.Telemetry.Metrics == nil {
+			e.Telemetry.Metrics = DefaultEnvoyGatewayMetrics()
+		}
+		if e.Telemetry.Metrics.Prometheus == nil {
+			e.Telemetry.Metrics.Prometheus = DefaultEnvoyGatewayPrometheus()
+		}
+		return e.Telemetry
+	}
+	e.Telemetry = DefaultEnvoyGatewayTelemetry()
+
+	return e.Telemetry
+}
+
+// IfEnablePrometheus returns whether Prometheus metrics in pull mode are enabled.
+func (e *EnvoyGateway) IfEnablePrometheus() bool {
+	return e.GetEnvoyGatewayTelemetry().Metrics.Prometheus.Enable
+}
+
+// DefaultEnvoyGatewayTelemetry returns a new EnvoyGatewayTelemetry with default configuration parameters.
+func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry {
+	return &EnvoyGatewayTelemetry{
+		Metrics: DefaultEnvoyGatewayMetrics(),
+	}
+}
+
+// DefaultEnvoyGatewayMetrics returns a new EnvoyGatewayMetrics with default configuration parameters.
+func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics {
+	return &EnvoyGatewayMetrics{
+		// Enable prometheus pull by default.
+		Prometheus: &EnvoyGatewayPrometheusProvider{
+			Enable: true,
+		},
+	}
+}
+
+// DefaultEnvoyGatewayPrometheus returns a new EnvoyGatewayPrometheusProvider with default configuration parameters.
+func DefaultEnvoyGatewayPrometheus() *EnvoyGatewayPrometheusProvider {
+	return &EnvoyGatewayPrometheusProvider{
+		// Enable prometheus pull by default.
+		Enable: true,
+	}
+}
+
 // DefaultEnvoyGatewayProvider returns a new EnvoyGatewayProvider with default configuration parameters.
 func DefaultEnvoyGatewayProvider() *EnvoyGatewayProvider {
 	return &EnvoyGatewayProvider{
@@ -103,11 +174,17 @@ func DefaultEnvoyGatewayKubeProvider() *EnvoyGatewayKubernetesProvider {
 // DefaultEnvoyGatewayAdmin returns a new EnvoyGatewayAdmin with default configuration parameters.
 func DefaultEnvoyGatewayAdmin() *EnvoyGatewayAdmin {
 	return &EnvoyGatewayAdmin{
-		Debug:   false,
 		Address: DefaultEnvoyGatewayAdminAddress(),
 	}
 }

+// DefaultEnvoyGatewayDebug returns a new EnvoyGatewayDebug with default configuration parameters.
+func DefaultEnvoyGatewayDebug() *EnvoyGatewayDebug {
+	return &EnvoyGatewayDebug{
+		DumpConfig: false,
+	}
+}
+
 // DefaultEnvoyGatewayAdminAddress returns a new EnvoyGatewayAdminAddress with default configuration parameters.
 func DefaultEnvoyGatewayAdminAddress() *EnvoyGatewayAdminAddress {
 	return &EnvoyGatewayAdminAddress{
diff --git a/api/v1alpha1/envoygateway_metric_types.go b/api/v1alpha1/envoygateway_metric_types.go
new file mode 100644
index 000000000000..8ec534e78540
--- /dev/null
+++ b/api/v1alpha1/envoygateway_metric_types.go
@@ -0,0 +1,43 @@
+// Copyright Envoy Gateway Authors
+// SPDX-License-Identifier: Apache-2.0
+// The full text of the Apache license is available in the LICENSE file at
+// the root of the repo.
+
+package v1alpha1
+
+// EnvoyGatewayMetrics defines control plane push/pull metrics configurations.
+type EnvoyGatewayMetrics struct {
+	// Sinks defines the metric sinks where metrics are sent to.
+	Sinks []EnvoyGatewayMetricSink `json:"sinks,omitempty"`
+	// Prometheus defines the configuration for the Prometheus endpoint.
+	Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"`
+}
+
+// EnvoyGatewayMetricSink defines control plane
+// metric sinks where metrics are sent to.
+type EnvoyGatewayMetricSink struct {
+	// Type defines the metric sink type.
+	// EG control plane currently supports OpenTelemetry.
+	// +kubebuilder:validation:Enum=OpenTelemetry
+	// +kubebuilder:default=OpenTelemetry
+	Type MetricSinkType `json:"type"`
+	// Host defines the sink service hostname.
+	Host string `json:"host"`
+	// Protocol defines the sink service protocol.
+	Protocol string `json:"protocol"`
+	// Port defines the port the sink service is exposed on.
+	//
+	// +optional
+	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:default=4317
+	Port int32 `json:"port,omitempty"`
+}
+
+// EnvoyGatewayPrometheusProvider will expose the Prometheus endpoint in pull mode.
+type EnvoyGatewayPrometheusProvider struct {
+	// Enable defines whether the Prometheus metrics in pull mode are enabled. Default is true.
+	//
+	// +optional
+	// +kubebuilder:default=true
+	Enable bool `json:"enable,omitempty"`
+}
diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go
index b96218f0746e..1d146db980e5 100644
--- a/api/v1alpha1/envoygateway_types.go
+++ b/api/v1alpha1/envoygateway_types.go
@@ -51,6 +51,12 @@ type EnvoyGatewaySpec struct {
 	// +optional
 	// +kubebuilder:default={default: info}
 	Logging *EnvoyGatewayLogging `json:"logging,omitempty"`
+
+	// Telemetry defines telemetry-related configurations for Envoy Gateway.
+	//
+	// +optional
+	Telemetry *EnvoyGatewayTelemetry `json:"telemetry,omitempty"`
+
 	// Admin defines the desired admin related abilities.
 	// If unspecified, the Admin is used with default configuration
 	// parameters.
@@ -58,6 +64,12 @@ type EnvoyGatewaySpec struct {
 	// +optional
 	Admin *EnvoyGatewayAdmin `json:"admin,omitempty"`

+	// Debug defines the desired debug-related abilities.
+	// If unspecified, debug features such as pprof and config dump are disabled.
+	//
+	// +optional
+	Debug *EnvoyGatewayDebug `json:"debug,omitempty"`
+
 	// RateLimit defines the configuration associated with the Rate Limit service
 	// deployed by Envoy Gateway required to implement the Global Rate limiting
 	// functionality. The specific rate limit service used here is the reference
@@ -79,6 +91,13 @@ type EnvoyGatewaySpec struct {
 	ExtensionAPIs *ExtensionAPISettings `json:"extensionApis,omitempty"`
 }

+// EnvoyGatewayTelemetry defines telemetry configurations for the Envoy Gateway control plane.
+// The control plane currently focuses on metrics observability; tracing telemetry will follow later.
+type EnvoyGatewayTelemetry struct {
+	// Metrics defines metrics configuration for Envoy Gateway.
+	Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"`
+}
+
 // EnvoyGatewayLogging defines logging for Envoy Gateway.
 type EnvoyGatewayLogging struct {
 	// Level is the logging level. If unspecified, defaults to "info".
@@ -424,11 +443,21 @@ type EnvoyGatewayAdmin struct {
 	//
 	// +optional
 	Address *EnvoyGatewayAdminAddress `json:"address,omitempty"`
+}
+
+// EnvoyGatewayDebug defines the Envoy Gateway Debug configuration.
+type EnvoyGatewayDebug struct {
+
+	// DumpConfig defines whether to dump the Envoy Gateway config in the logs.
+	//
+	// +optional
+	DumpConfig bool `json:"dumpConfig,omitempty"`

-	// Debug defines if enable the /debug endpoint of Envoy Gateway.
+	// Address defines the address of the Envoy Gateway Debug Server.
+	// pprof endpoints are served on this address when it is set.
 	//
 	// +optional
-	Debug bool `json:"debug,omitempty"`
+	Address *EnvoyGatewayDebugAddress `json:"address,omitempty"`
 }

 // EnvoyGatewayAdminAddress defines the Envoy Gateway Admin Address configuration.
@@ -446,6 +475,21 @@ type EnvoyGatewayAdminAddress struct {
 	Host string `json:"host,omitempty"`
 }

+// EnvoyGatewayDebugAddress defines the Envoy Gateway Debug Address configuration.
+type EnvoyGatewayDebugAddress struct {
+	// Port defines the port the debug server is exposed on.
+	//
+	// +optional
+	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:default=19010
+	Port int `json:"port,omitempty"`
+	// Host defines the debug server hostname.
+	//
+	// +optional
+	// +kubebuilder:default="127.0.0.1"
+	Host string `json:"host,omitempty"`
+}
+
 func init() {
 	SchemeBuilder.Register(&EnvoyGateway{})
 }
diff --git a/api/v1alpha1/envoyproxy_types.go b/api/v1alpha1/envoyproxy_types.go
index 575eb2b9e08e..00f4c38a7c43 100644
--- a/api/v1alpha1/envoyproxy_types.go
+++ b/api/v1alpha1/envoyproxy_types.go
@@ -129,44 +129,44 @@ type ProxyLogging struct {
 	// and the log level is the value. If unspecified, defaults to "default: warn".
 	//
 	// +kubebuilder:default={default: warn}
-	Level map[LogComponent]LogLevel `json:"level,omitempty"`
+	Level map[ProxyLogComponent]LogLevel `json:"level,omitempty"`
 }

-// LogComponent defines a component that supports a configured logging level.
+// ProxyLogComponent defines a component that supports a configured logging level.
 // +kubebuilder:validation:Enum=system;upstream;http;connection;admin;client;filter;main;router;runtime
-type LogComponent string
+type ProxyLogComponent string

 const (
 	// LogComponentDefault defines the default logging component.
// See more details: https://www.envoyproxy.io/docs/envoy/latest/operations/cli#cmdoption-l - LogComponentDefault LogComponent = "default" + LogComponentDefault ProxyLogComponent = "default" // LogComponentUpstream defines the "upstream" logging component. - LogComponentUpstream LogComponent = "upstream" + LogComponentUpstream ProxyLogComponent = "upstream" // LogComponentHTTP defines the "http" logging component. - LogComponentHTTP LogComponent = "http" + LogComponentHTTP ProxyLogComponent = "http" // LogComponentConnection defines the "connection" logging component. - LogComponentConnection LogComponent = "connection" + LogComponentConnection ProxyLogComponent = "connection" // LogComponentAdmin defines the "admin" logging component. - LogComponentAdmin LogComponent = "admin" + LogComponentAdmin ProxyLogComponent = "admin" // LogComponentClient defines the "client" logging component. - LogComponentClient LogComponent = "client" + LogComponentClient ProxyLogComponent = "client" // LogComponentFilter defines the "filter" logging component. - LogComponentFilter LogComponent = "filter" + LogComponentFilter ProxyLogComponent = "filter" // LogComponentMain defines the "main" logging component. - LogComponentMain LogComponent = "main" + LogComponentMain ProxyLogComponent = "main" // LogComponentRouter defines the "router" logging component. - LogComponentRouter LogComponent = "router" + LogComponentRouter ProxyLogComponent = "router" // LogComponentRuntime defines the "runtime" logging component. - LogComponentRuntime LogComponent = "runtime" + LogComponentRuntime ProxyLogComponent = "runtime" ) // ProxyBootstrap defines Envoy Bootstrap configuration. diff --git a/api/v1alpha1/metric_types.go b/api/v1alpha1/metric_types.go index 03efd6fa04fc..96214ec4f831 100644 --- a/api/v1alpha1/metric_types.go +++ b/api/v1alpha1/metric_types.go @@ -5,9 +5,15 @@ package v1alpha1 +type MetricSinkType string + +const ( + MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" +) + type ProxyMetrics struct { // Prometheus defines the configuration for Admin endpoint `/stats/prometheus`. - Prometheus *PrometheusProvider `json:"prometheus,omitempty"` + Prometheus *ProxyPrometheusProvider `json:"prometheus,omitempty"` // Sinks defines the metric sinks where metrics are sent to. Sinks []MetricSink `json:"sinks,omitempty"` // Matches defines configuration for selecting specific metrics instead of generating all metrics stats @@ -23,12 +29,6 @@ type ProxyMetrics struct { EnableVirtualHostStats bool `json:"enableVirtualHostStats,omitempty"` } -type MetricSinkType string - -const ( - MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" -) - type MetricSink struct { // Type defines the metric sink type. // EG currently only supports OpenTelemetry. 
@@ -71,5 +71,5 @@ type OpenTelemetrySink struct { // TODO: add support for customizing OpenTelemetry sink in https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/stat_sinks/open_telemetry/v3/open_telemetry.proto#envoy-v3-api-msg-extensions-stat-sinks-open-telemetry-v3-sinkconfig } -type PrometheusProvider struct { +type ProxyPrometheusProvider struct { } diff --git a/api/v1alpha1/validation/validate_test.go b/api/v1alpha1/validation/validate_test.go index 1a12cd54e094..5c4da91252a1 100644 --- a/api/v1alpha1/validation/validate_test.go +++ b/api/v1alpha1/validation/validate_test.go @@ -535,15 +535,12 @@ func TestEnvoyGatewayAdmin(t *testing.T) { // values should be set in default egAdmin := eg.GetEnvoyGatewayAdmin() assert.True(t, egAdmin != nil) - assert.True(t, egAdmin.Debug == false) assert.True(t, egAdmin.Address.Port == egv1a1.GatewayAdminPort) assert.True(t, egAdmin.Address.Host == egv1a1.GatewayAdminHost) // override the admin config // values should be updated - eg.Admin.Debug = true eg.Admin.Address = nil - assert.True(t, eg.Admin.Debug == true) assert.True(t, eg.GetEnvoyGatewayAdmin().Address.Port == egv1a1.GatewayAdminPort) assert.True(t, eg.GetEnvoyGatewayAdmin().Address.Host == egv1a1.GatewayAdminHost) @@ -552,7 +549,6 @@ func TestEnvoyGatewayAdmin(t *testing.T) { eg.Admin = nil eg.SetEnvoyGatewayDefaults() assert.True(t, eg.Admin != nil) - assert.True(t, eg.Admin.Debug == false) assert.True(t, eg.Admin.Address.Port == egv1a1.GatewayAdminPort) assert.True(t, eg.Admin.Address.Host == egv1a1.GatewayAdminHost) } @@ -560,12 +556,12 @@ func TestEnvoyGatewayAdmin(t *testing.T) { func TestGetEnvoyProxyDefaultComponentLevel(t *testing.T) { cases := []struct { logging egv1a1.ProxyLogging - component egv1a1.LogComponent + component egv1a1.ProxyLogComponent expected egv1a1.LogLevel }{ { logging: egv1a1.ProxyLogging{ - Level: map[egv1a1.LogComponent]egv1a1.LogLevel{ + Level: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelInfo, }, }, @@ -573,7 +569,7 @@ func TestGetEnvoyProxyDefaultComponentLevel(t *testing.T) { }, { logging: egv1a1.ProxyLogging{ - Level: map[egv1a1.LogComponent]egv1a1.LogLevel{ + Level: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelInfo, }, }, @@ -600,7 +596,7 @@ func TestGetEnvoyProxyComponentLevelArgs(t *testing.T) { }, { logging: egv1a1.ProxyLogging{ - Level: map[egv1a1.LogComponent]egv1a1.LogLevel{ + Level: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelInfo, }, }, @@ -608,7 +604,7 @@ func TestGetEnvoyProxyComponentLevelArgs(t *testing.T) { }, { logging: egv1a1.ProxyLogging{ - Level: map[egv1a1.LogComponent]egv1a1.LogLevel{ + Level: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelInfo, egv1a1.LogComponentAdmin: egv1a1.LogLevelWarn, }, @@ -617,7 +613,7 @@ func TestGetEnvoyProxyComponentLevelArgs(t *testing.T) { }, { logging: egv1a1.ProxyLogging{ - Level: map[egv1a1.LogComponent]egv1a1.LogLevel{ + Level: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelInfo, egv1a1.LogComponentAdmin: egv1a1.LogLevelWarn, egv1a1.LogComponentFilter: egv1a1.LogLevelDebug, @@ -627,7 +623,7 @@ func TestGetEnvoyProxyComponentLevelArgs(t *testing.T) { }, { logging: egv1a1.ProxyLogging{ - Level: map[egv1a1.LogComponent]egv1a1.LogLevel{ + Level: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelInfo, egv1a1.LogComponentAdmin: 
egv1a1.LogLevelWarn, egv1a1.LogComponentFilter: egv1a1.LogLevelDebug, diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index b0a1f4f18c06..714aa9e60879 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -342,6 +342,41 @@ func (in *EnvoyGatewayCustomProvider) DeepCopy() *EnvoyGatewayCustomProvider { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayDebug) DeepCopyInto(out *EnvoyGatewayDebug) { + *out = *in + if in.Address != nil { + in, out := &in.Address, &out.Address + *out = new(EnvoyGatewayDebugAddress) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayDebug. +func (in *EnvoyGatewayDebug) DeepCopy() *EnvoyGatewayDebug { + if in == nil { + return nil + } + out := new(EnvoyGatewayDebug) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayDebugAddress) DeepCopyInto(out *EnvoyGatewayDebugAddress) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayDebugAddress. +func (in *EnvoyGatewayDebugAddress) DeepCopy() *EnvoyGatewayDebugAddress { + if in == nil { + return nil + } + out := new(EnvoyGatewayDebugAddress) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyGatewayFileResourceProvider) DeepCopyInto(out *EnvoyGatewayFileResourceProvider) { *out = *in @@ -449,6 +484,61 @@ func (in *EnvoyGatewayLogging) DeepCopy() *EnvoyGatewayLogging { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayMetricSink) DeepCopyInto(out *EnvoyGatewayMetricSink) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayMetricSink. +func (in *EnvoyGatewayMetricSink) DeepCopy() *EnvoyGatewayMetricSink { + if in == nil { + return nil + } + out := new(EnvoyGatewayMetricSink) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayMetrics) DeepCopyInto(out *EnvoyGatewayMetrics) { + *out = *in + if in.Sinks != nil { + in, out := &in.Sinks, &out.Sinks + *out = make([]EnvoyGatewayMetricSink, len(*in)) + copy(*out, *in) + } + if in.Prometheus != nil { + in, out := &in.Prometheus, &out.Prometheus + *out = new(EnvoyGatewayPrometheusProvider) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayMetrics. +func (in *EnvoyGatewayMetrics) DeepCopy() *EnvoyGatewayMetrics { + if in == nil { + return nil + } + out := new(EnvoyGatewayMetrics) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayPrometheusProvider) DeepCopyInto(out *EnvoyGatewayPrometheusProvider) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayPrometheusProvider. 
+func (in *EnvoyGatewayPrometheusProvider) DeepCopy() *EnvoyGatewayPrometheusProvider { + if in == nil { + return nil + } + out := new(EnvoyGatewayPrometheusProvider) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyGatewayProvider) DeepCopyInto(out *EnvoyGatewayProvider) { *out = *in @@ -512,11 +602,21 @@ func (in *EnvoyGatewaySpec) DeepCopyInto(out *EnvoyGatewaySpec) { *out = new(EnvoyGatewayLogging) (*in).DeepCopyInto(*out) } + if in.Telemetry != nil { + in, out := &in.Telemetry, &out.Telemetry + *out = new(EnvoyGatewayTelemetry) + (*in).DeepCopyInto(*out) + } if in.Admin != nil { in, out := &in.Admin, &out.Admin *out = new(EnvoyGatewayAdmin) (*in).DeepCopyInto(*out) } + if in.Debug != nil { + in, out := &in.Debug, &out.Debug + *out = new(EnvoyGatewayDebug) + (*in).DeepCopyInto(*out) + } if in.RateLimit != nil { in, out := &in.RateLimit, &out.RateLimit *out = new(RateLimit) @@ -544,6 +644,26 @@ func (in *EnvoyGatewaySpec) DeepCopy() *EnvoyGatewaySpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayTelemetry) DeepCopyInto(out *EnvoyGatewayTelemetry) { + *out = *in + if in.Metrics != nil { + in, out := &in.Metrics, &out.Metrics + *out = new(EnvoyGatewayMetrics) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTelemetry. +func (in *EnvoyGatewayTelemetry) DeepCopy() *EnvoyGatewayTelemetry { + if in == nil { + return nil + } + out := new(EnvoyGatewayTelemetry) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyJSONPatchConfig) DeepCopyInto(out *EnvoyJSONPatchConfig) { *out = *in @@ -1351,21 +1471,6 @@ func (in *OpenTelemetrySink) DeepCopy() *OpenTelemetrySink { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PrometheusProvider) DeepCopyInto(out *PrometheusProvider) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PrometheusProvider. -func (in *PrometheusProvider) DeepCopy() *PrometheusProvider { - if in == nil { - return nil - } - out := new(PrometheusProvider) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProxyAccessLog) DeepCopyInto(out *ProxyAccessLog) { *out = *in @@ -1488,7 +1593,7 @@ func (in *ProxyLogging) DeepCopyInto(out *ProxyLogging) { *out = *in if in.Level != nil { in, out := &in.Level, &out.Level - *out = make(map[LogComponent]LogLevel, len(*in)) + *out = make(map[ProxyLogComponent]LogLevel, len(*in)) for key, val := range *in { (*out)[key] = val } @@ -1510,7 +1615,7 @@ func (in *ProxyMetrics) DeepCopyInto(out *ProxyMetrics) { *out = *in if in.Prometheus != nil { in, out := &in.Prometheus, &out.Prometheus - *out = new(PrometheusProvider) + *out = new(ProxyPrometheusProvider) **out = **in } if in.Sinks != nil { @@ -1537,6 +1642,21 @@ func (in *ProxyMetrics) DeepCopy() *ProxyMetrics { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ProxyPrometheusProvider) DeepCopyInto(out *ProxyPrometheusProvider) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProxyPrometheusProvider. +func (in *ProxyPrometheusProvider) DeepCopy() *ProxyPrometheusProvider { + if in == nil { + return nil + } + out := new(ProxyPrometheusProvider) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProxyTelemetry) DeepCopyInto(out *ProxyTelemetry) { *out = *in diff --git a/go.mod b/go.mod index 06e35f26d43f..c421b93a0c2f 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,12 @@ require ( github.com/telepresenceio/watchable v0.0.0-20220726211108-9bb86f92afa7 github.com/tetratelabs/multierror v1.1.1 github.com/tsaarni/certyaml v0.9.2 + go.opentelemetry.io/otel v1.19.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.42.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.42.0 + go.opentelemetry.io/otel/exporters/prometheus v0.42.0 + go.opentelemetry.io/otel/metric v1.19.0 + go.opentelemetry.io/otel/sdk/metric v1.19.0 go.opentelemetry.io/proto/otlp v1.0.0 go.uber.org/zap v1.26.0 golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e @@ -41,7 +47,15 @@ require ( sigs.k8s.io/yaml v1.3.0 ) -require golang.org/x/sync v0.3.0 // indirect +require ( + github.com/cenkalti/backoff/v4 v4.2.1 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.42.0 // indirect + go.opentelemetry.io/otel/sdk v1.19.0 // indirect + go.opentelemetry.io/otel/trace v1.19.0 // indirect + golang.org/x/sync v0.3.0 // indirect +) require ( github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect @@ -86,7 +100,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.16.0 // indirect + github.com/prometheus/client_golang v1.16.0 github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/procfs v0.10.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect @@ -97,7 +111,7 @@ require ( go.uber.org/multierr v1.11.0 // indirect golang.org/x/net v0.14.0 // indirect golang.org/x/oauth2 v0.10.0 // indirect - golang.org/x/sys v0.11.0 // indirect + golang.org/x/sys v0.12.0 // indirect golang.org/x/term v0.11.0 // indirect golang.org/x/text v0.12.0 // indirect golang.org/x/time v0.3.0 // indirect diff --git a/go.sum b/go.sum index d29f63622275..589fd6968de1 100644 --- a/go.sum +++ b/go.sum @@ -37,6 +37,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= +github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.4.1 
h1:iKLQ0xPNFxR/2hzXZMrBo8f1j86j5WHzznCCQxV/b8g= github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= @@ -125,8 +127,11 @@ github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v0.1.0/go.mod h1:tabnROwaDl0UNxkVeFRbY8bwB37GwRv0P8lg6aAiEnk= github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo= github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA= @@ -188,6 +193,7 @@ github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXP github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= @@ -253,6 +259,8 @@ github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -466,6 +474,24 @@ go.mongodb.org/mongo-driver v1.0.3/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qL go.mongodb.org/mongo-driver v1.1.1/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.mongodb.org/mongo-driver v1.1.2/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opentelemetry.io/otel v1.19.0 h1:MuS/TNf4/j4IXsZuJegVzI1cwut7Qc00344rgH7p8bs= +go.opentelemetry.io/otel v1.19.0/go.mod h1:i0QyjOq3UPoTzff0PJB2N66fb4S0+rSbSB15/oyH9fY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.42.0 h1:ZtfnDL+tUrs1F0Pzfwbg2d59Gru9NCH3bgSHBM6LDwU= 
+go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.42.0/go.mod h1:hG4Fj/y8TR/tlEDREo8tWstl9fO9gcFkn4xrx0Io8xU= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.42.0 h1:NmnYCiR0qNufkldjVvyQfZTHSdzeHoZ41zggMsdMcLM= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.42.0/go.mod h1:UVAO61+umUsHLtYb8KXXRoHtxUkdOPkYidzW3gipRLQ= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.42.0 h1:wNMDy/LVGLj2h3p6zg4d0gypKfWKSWI14E1C4smOgl8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.42.0/go.mod h1:YfbDdXAAkemWJK3H/DshvlrxqFB2rtW4rY6ky/3x/H0= +go.opentelemetry.io/otel/exporters/prometheus v0.42.0 h1:jwV9iQdvp38fxXi8ZC+lNpxjK16MRcZlpDYvbuO1FiA= +go.opentelemetry.io/otel/exporters/prometheus v0.42.0/go.mod h1:f3bYiqNqhoPxkvI2LrXqQVC546K7BuRDL/kKuxkujhA= +go.opentelemetry.io/otel/metric v1.19.0 h1:aTzpGtV0ar9wlV4Sna9sdJyII5jTVJEvKETPiOKwvpE= +go.opentelemetry.io/otel/metric v1.19.0/go.mod h1:L5rUsV9kM1IxCj1MmSdS+JQAcVm319EUrDVLrt7jqt8= +go.opentelemetry.io/otel/sdk v1.19.0 h1:6USY6zH+L8uMH8L3t1enZPR3WFEmSTADlqldyHtJi3o= +go.opentelemetry.io/otel/sdk v1.19.0/go.mod h1:NedEbbS4w3C6zElbLdPJKOpJQOrGUJ+GfzpjUvI0v1A= +go.opentelemetry.io/otel/sdk/metric v1.19.0 h1:EJoTO5qysMsYCa+w4UghwFV/ptQgqSL/8Ni+hx+8i1k= +go.opentelemetry.io/otel/sdk/metric v1.19.0/go.mod h1:XjG0jQyFJrv2PbMvwND7LwCEhsJzCzV5210euduKcKY= +go.opentelemetry.io/otel/trace v1.19.0 h1:DFVQmlVbfVeOuBRrwdtaehRrWiL1JoVs9CPIQ1Dzxpg= +go.opentelemetry.io/otel/trace v1.19.0/go.mod h1:mfaSyvGyEJEI0nyV2I4qhNQnbBOUUmYZpYojqMnX2vo= go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I= go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM= go.starlark.net v0.0.0-20230525235612-a134d8f9ddca h1:VdD38733bfYv5tUZwEIskMM93VanwNIi5bIKnDrJdEY= @@ -575,8 +601,8 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20220526004731-065cf7ba2467/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.11.0 h1:F9tnn/DA/Im8nCwm+fX+1/eBwi4qFjRT++MhtVC4ZX0= diff --git a/internal/cmd/server.go b/internal/cmd/server.go index b6979023f09a..f62d508f7aeb 100644 --- a/internal/cmd/server.go +++ b/internal/cmd/server.go @@ -6,12 +6,6 @@ package cmd import ( - "fmt" - "net" - "net/http" - "net/http/pprof" - "time" - "github.com/davecgh/go-spew/spew" "github.com/spf13/cobra" @@ -57,7 +51,7 @@ func server() error { return err } - if cfg.EnvoyGateway.Admin.Debug { + if cfg.EnvoyGateway.GetEnvoyGatewayDebug().DumpConfig { spewConfig := spew.NewDefaultConfig() spewConfig.DisableMethods = true spewConfig.Dump(cfg) @@ -203,9 +197,6 @@ func setupRunners(cfg *config.Server) error { } } - // Start the admin server - go setupAdminServer(cfg) - 
// Wait until done <-ctx.Done() // Close messages @@ -223,33 +214,3 @@ func setupRunners(cfg *config.Server) error { return nil } - -func setupAdminServer(cfg *config.Server) { - adminHandlers := http.NewServeMux() - - address := cfg.EnvoyGateway.GetEnvoyGatewayAdmin().Address - - if cfg.EnvoyGateway.GetEnvoyGatewayAdmin().Debug { - // Serve pprof endpoints to aid in live debugging. - adminHandlers.HandleFunc("/debug/pprof/", pprof.Index) - adminHandlers.HandleFunc("/debug/pprof/profile", pprof.Profile) - adminHandlers.HandleFunc("/debug/pprof/trace", pprof.Trace) - adminHandlers.HandleFunc("/debug/pprof/symbol", pprof.Symbol) - adminHandlers.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) - } - - adminServer := &http.Server{ - Handler: adminHandlers, - Addr: net.JoinHostPort(address.Host, fmt.Sprint(address.Port)), - ReadTimeout: 5 * time.Second, - ReadHeaderTimeout: 5 * time.Second, - WriteTimeout: 10 * time.Second, - IdleTimeout: 15 * time.Second, - } - - // Listen And Serve Admin Server. - if err := adminServer.ListenAndServe(); err != nil { - cfg.Logger.Error(err, "start debug server failed") - } - -} diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index 1c2cef285514..aafbd70daf1f 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -49,8 +49,8 @@ func (r *Runner) Start(ctx context.Context) (err error) { } func (r *Runner) subscribeAndTranslate(ctx context.Context) { - message.HandleSubscription(r.ProviderResources.GatewayAPIResources.Subscribe(ctx), - func(update message.Update[string, *gatewayapi.Resources]) { + message.HandleSubscription(message.UpdateMetadata{Component: r.Name()}, r.ProviderResources.GatewayAPIResources.Subscribe(ctx), + func(update message.Update[string, *gatewayapi.Resources], errChans chan error) { r.Logger.Info("received an update") val := update.Value @@ -93,6 +93,7 @@ func (r *Runner) subscribeAndTranslate(ctx context.Context) { for key, val := range result.InfraIR { if err := val.Validate(); err != nil { r.Logger.Error(err, "unable to validate infra ir, skipped sending it") + errChans <- err } else { r.InfraIR.Store(key, val) newKeys = append(newKeys, key) @@ -102,6 +103,7 @@ func (r *Runner) subscribeAndTranslate(ctx context.Context) { for key, val := range result.XdsIR { if err := val.Validate(); err != nil { r.Logger.Error(err, "unable to validate xds ir, skipped sending it") + errChans <- err } else { r.XdsIR.Store(key, val) } @@ -152,7 +154,6 @@ func (r *Runner) subscribeAndTranslate(ctx context.Context) { key := utils.NamespacedName(clientTrafficPolicy) r.ProviderResources.ClientTrafficPolicyStatuses.Store(key, &clientTrafficPolicy.Status) } - }, ) r.Logger.Info("shutting down") diff --git a/internal/globalratelimit/runner/runner.go b/internal/globalratelimit/runner/runner.go index 8e64322965f0..650131651acf 100644 --- a/internal/globalratelimit/runner/runner.go +++ b/internal/globalratelimit/runner/runner.go @@ -111,19 +111,21 @@ func (r *Runner) serveXdsConfigServer(ctx context.Context) { func (r *Runner) subscribeAndTranslate(ctx context.Context) { // Subscribe to resources. 
- message.HandleSubscription(r.XdsIR.Subscribe(ctx), - func(update message.Update[string, *ir.Xds]) { + message.HandleSubscription(message.UpdateMetadata{Component: r.Name()}, r.XdsIR.Subscribe(ctx), + func(update message.Update[string, *ir.Xds], errChans chan error) { r.Logger.Info("received a notification") if update.Delete { if err := r.addNewSnapshot(ctx, nil); err != nil { r.Logger.Error(err, "failed to update the config snapshot") + errChans <- err } } else { // Translate to ratelimit xDS Config. rvt, err := r.translate(update.Value) if err != nil { r.Logger.Error(err, err.Error()) + errChans <- err } // Update ratelimit xDS config cache. diff --git a/internal/infrastructure/kubernetes/infra_client.go b/internal/infrastructure/kubernetes/infra_client.go index e9b8090b9706..8d9b4ab81d04 100644 --- a/internal/infrastructure/kubernetes/infra_client.go +++ b/internal/infrastructure/kubernetes/infra_client.go @@ -29,8 +29,17 @@ func (cli *InfraClient) CreateOrUpdate(ctx context.Context, key client.ObjectKey return retry.RetryOnConflict(retry.DefaultBackoff, func() error { if err := cli.Client.Get(ctx, key, current); err != nil { if kerrors.IsNotFound(err) { + infraManagerResourcesCreated.With( + k8sResourceTypeLabel.Value(specific.GetObjectKind().GroupVersionKind().Kind), + k8sResourceNameLabel.Value(key.Name), + k8sResourceNamespaceLabel.Value(key.Namespace)).Increment() // Create if it does not exist. if err := cli.Client.Create(ctx, specific); err != nil { + infraManagerResourcesErrors.With( + k8sResourceTypeLabel.Value(specific.GetObjectKind().GroupVersionKind().Kind), + operationLabel.Value("created"), + k8sResourceNameLabel.Value(key.Name), + k8sResourceNamespaceLabel.Value(key.Namespace)).Increment() return errors.Wrap(err, "for Create") } } @@ -39,7 +48,16 @@ func (cli *InfraClient) CreateOrUpdate(ctx context.Context, key client.ObjectKey // just perform an update for now. 
if updateChecker() { specific.SetUID(current.GetUID()) + infraManagerResourcesUpdated.With( + k8sResourceTypeLabel.Value(specific.GetObjectKind().GroupVersionKind().Kind), + k8sResourceNameLabel.Value(key.Name), + k8sResourceNamespaceLabel.Value(key.Namespace)).Increment() if err := cli.Client.Update(ctx, specific); err != nil { + infraManagerResourcesErrors.With( + k8sResourceTypeLabel.Value(specific.GetObjectKind().GroupVersionKind().Kind), + operationLabel.Value("updated"), + k8sResourceNameLabel.Value(key.Name), + k8sResourceNamespaceLabel.Value(key.Namespace)).Increment() return errors.Wrap(err, "for Update") } } @@ -50,10 +68,19 @@ func (cli *InfraClient) CreateOrUpdate(ctx context.Context, key client.ObjectKey } func (cli *InfraClient) Delete(ctx context.Context, object client.Object) error { + infraManagerResourcesDeleted.With( + k8sResourceTypeLabel.Value(object.GetObjectKind().GroupVersionKind().Kind), + k8sResourceNameLabel.Value(object.GetName()), + k8sResourceNamespaceLabel.Value(object.GetNamespace())).Increment() if err := cli.Client.Delete(ctx, object); err != nil { if kerrors.IsNotFound(err) { return nil } + infraManagerResourcesErrors.With( + k8sResourceTypeLabel.Value(object.GetObjectKind().GroupVersionKind().Kind), + operationLabel.Value("deleted"), + k8sResourceNameLabel.Value(object.GetName()), + k8sResourceNamespaceLabel.Value(object.GetNamespace())).Increment() return err } diff --git a/internal/infrastructure/kubernetes/infra_resource.go b/internal/infrastructure/kubernetes/infra_resource.go index af041ad33135..eec62e76c195 100644 --- a/internal/infrastructure/kubernetes/infra_resource.go +++ b/internal/infrastructure/kubernetes/infra_resource.go @@ -105,6 +105,9 @@ func (i *Infra) deleteServiceAccount(ctx context.Context, r ResourceRender) erro Namespace: i.Namespace, Name: r.Name(), }, + TypeMeta: metav1.TypeMeta{ + Kind: "ServiceAccount", + }, } return i.Client.Delete(ctx, sa) @@ -117,6 +120,9 @@ func (i *Infra) deleteDeployment(ctx context.Context, r ResourceRender) error { Namespace: i.Namespace, Name: r.Name(), }, + TypeMeta: metav1.TypeMeta{ + Kind: "Deployment", + }, } return i.Client.Delete(ctx, deployment) @@ -129,6 +135,9 @@ func (i *Infra) deleteConfigMap(ctx context.Context, r ResourceRender) error { Namespace: i.Namespace, Name: r.Name(), }, + TypeMeta: metav1.TypeMeta{ + Kind: "ConfigMap", + }, } return i.Client.Delete(ctx, cm) @@ -141,6 +150,9 @@ func (i *Infra) deleteService(ctx context.Context, r ResourceRender) error { Namespace: i.Namespace, Name: r.Name(), }, + TypeMeta: metav1.TypeMeta{ + Kind: "Service", + }, } return i.Client.Delete(ctx, svc) diff --git a/internal/infrastructure/kubernetes/metrics.go b/internal/infrastructure/kubernetes/metrics.go new file mode 100644 index 000000000000..403a1fcd02f2 --- /dev/null +++ b/internal/infrastructure/kubernetes/metrics.go @@ -0,0 +1,38 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. 
+ +package kubernetes + +import ( + "github.com/envoyproxy/gateway/internal/metrics" +) + +var ( + infraManagerResourcesCreated = metrics.NewCounter( + "infra_manager_resources_created_total", + "Total number of the resources created by infra manager.", + ) + + infraManagerResourcesUpdated = metrics.NewCounter( + "infra_manager_resources_updated_total", + "Total number of the resources updated by infra manager.", + ) + + infraManagerResourcesDeleted = metrics.NewCounter( + "infra_manager_resources_deleted_total", + "Total number of the resources deleted by infra manager.", + ) + + infraManagerResourcesErrors = metrics.NewCounter( + "infra_manager_resources_errors_total", + "Total number of the resources errors encountered by infra manager.", + ) + + // metrics label definitions + operationLabel = metrics.NewLabel("operation") + k8sResourceTypeLabel = metrics.NewLabel("k8s_resource_type") + k8sResourceNamespaceLabel = metrics.NewLabel("k8s_resource_namespace") + k8sResourceNameLabel = metrics.NewLabel("k8s_resource_name") +) diff --git a/internal/infrastructure/kubernetes/proxy/resource_provider_test.go b/internal/infrastructure/kubernetes/proxy/resource_provider_test.go index 50ce69946e9c..b3894d475ca0 100644 --- a/internal/infrastructure/kubernetes/proxy/resource_provider_test.go +++ b/internal/infrastructure/kubernetes/proxy/resource_provider_test.go @@ -65,7 +65,7 @@ func TestDeployment(t *testing.T) { caseName string infra *ir.Infra deploy *egv1a1.KubernetesDeploymentSpec - proxyLogging map[egv1a1.LogComponent]egv1a1.LogLevel + proxyLogging map[egv1a1.ProxyLogComponent]egv1a1.LogLevel bootstrap string telemetry *egv1a1.ProxyTelemetry concurrency *int32 @@ -248,7 +248,7 @@ func TestDeployment(t *testing.T) { caseName: "component-level", infra: newTestInfra(), deploy: nil, - proxyLogging: map[egv1a1.LogComponent]egv1a1.LogLevel{ + proxyLogging: map[egv1a1.ProxyLogComponent]egv1a1.LogLevel{ egv1a1.LogComponentDefault: egv1a1.LogLevelError, egv1a1.LogComponentFilter: egv1a1.LogLevelInfo, }, @@ -259,7 +259,7 @@ func TestDeployment(t *testing.T) { infra: newTestInfra(), telemetry: &egv1a1.ProxyTelemetry{ Metrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{}, + Prometheus: &egv1a1.ProxyPrometheusProvider{}, }, }, }, diff --git a/internal/infrastructure/runner/runner.go b/internal/infrastructure/runner/runner.go index 98859a2e88bd..36efde47653f 100644 --- a/internal/infrastructure/runner/runner.go +++ b/internal/infrastructure/runner/runner.go @@ -54,19 +54,21 @@ func (r *Runner) Start(ctx context.Context) (err error) { func (r *Runner) subscribeToProxyInfraIR(ctx context.Context) { // Subscribe to resources - message.HandleSubscription(r.InfraIR.Subscribe(ctx), - func(update message.Update[string, *ir.Infra]) { + message.HandleSubscription(message.UpdateMetadata{Component: r.Name()}, r.InfraIR.Subscribe(ctx), + func(update message.Update[string, *ir.Infra], errChans chan error) { r.Logger.Info("received an update") val := update.Value if update.Delete { if err := r.mgr.DeleteProxyInfra(ctx, val); err != nil { r.Logger.Error(err, "failed to delete infra") + errChans <- err } } else { // Manage the proxy infra. 
 				if err := r.mgr.CreateOrUpdateProxyInfra(ctx, val); err != nil {
 					r.Logger.Error(err, "failed to create new infra")
+					errChans <- err
 				}
 			}
 		},
diff --git a/internal/message/metrics.go b/internal/message/metrics.go
new file mode 100644
index 000000000000..83ce102a19d8
--- /dev/null
+++ b/internal/message/metrics.go
@@ -0,0 +1,38 @@
+// Copyright Envoy Gateway Authors
+// SPDX-License-Identifier: Apache-2.0
+// The full text of the Apache license is available in the LICENSE file at
+// the root of the repo.
+
+package message
+
+import "github.com/envoyproxy/gateway/internal/metrics"
+
+var (
+	// metrics definitions
+	watchableHandleUpdates = metrics.NewCounter(
+		"watchable_queue_handle_updates_total",
+		"Total number of updates handled by the watchable queue.",
+	)
+
+	watchableHandleUpdateErrors = metrics.NewCounter(
+		"watchable_queue_handle_updates_errors_total",
+		"Total number of update errors handled by the watchable queue.",
+	)
+
+	watchableDepth = metrics.NewGauge(
+		"watchable_queue_depth",
+		"Current depth of the watchable message queue.",
+	)
+
+	watchableHandleUpdateTimeSeconds = metrics.NewHistogram(
+		"watchable_queue_handle_update_time_seconds",
+		"How long in seconds an update takes to be handled by the watchable queue.",
+		[]float64{0.001, 0.01, 0.1, 1, 5, 10},
+	)
+
+	// metrics label definitions
+	// component is which component the update belongs to.
+	componentNameLabel = metrics.NewLabel("component_name")
+	// resource is which resource the update belongs to.
+	resourceTypeLabel = metrics.NewLabel("resource_type")
+)
diff --git a/internal/message/watchutil.go b/internal/message/watchutil.go
index 29cecc37a9fb..3ed700d9eecd 100644
--- a/internal/message/watchutil.go
+++ b/internal/message/watchutil.go
@@ -6,11 +6,36 @@
 package message

 import (
+	"time"
+
 	"github.com/telepresenceio/watchable"
+
+	"github.com/envoyproxy/gateway/api/v1alpha1"
+	"github.com/envoyproxy/gateway/internal/logging"
+	"github.com/envoyproxy/gateway/internal/metrics"
 )

 type Update[K comparable, V any] watchable.Update[K, V]

+var logger = logging.DefaultLogger(v1alpha1.LogLevelInfo).WithName("watchable")
+
+type UpdateMetadata struct {
+	Component string
+	Resource  string
+}
+
+func (m UpdateMetadata) LabelValues() []metrics.LabelValue {
+	labels := []metrics.LabelValue{}
+	if m.Component != "" {
+		labels = append(labels, componentNameLabel.Value(m.Component))
+	}
+	if m.Resource != "" {
+		labels = append(labels, resourceTypeLabel.Value(m.Resource))
+	}
+
+	return labels
+}
+
 // HandleSubscription takes a channel returned by
 // watchable.Map.Subscribe() (or .SubscribeSubset()), and calls the
 // given function for each initial value in the map, and for any
@@ -20,20 +45,33 @@ type Update[K comparable, V any] watchable.Update[K, V]
 // it handles the case where the watchable.Map already contains
 // entries before .Subscribe is called.
func HandleSubscription[K comparable, V any]( + meta UpdateMetadata, subscription <-chan watchable.Snapshot[K, V], - handle func(Update[K, V]), + handle func(updateFunc Update[K, V], errChans chan error), ) { + errChans := make(chan error, 10) + go func() { + for err := range errChans { + logger.WithValues("component", meta.Component).Error(err, "observed an error") + watchableHandleUpdateErrors.With(meta.LabelValues()...).Increment() + } + }() + if snapshot, ok := <-subscription; ok { for k, v := range snapshot.State { - handle(Update[K, V]{ - Key: k, - Value: v, - }) + startHandleTime := time.Now() + handle(Update[K, V]{Key: k, Value: v}, errChans) + watchableHandleUpdates.With(meta.LabelValues()...).Increment() + watchableHandleUpdateTimeSeconds.With(meta.LabelValues()...).Record(time.Since(startHandleTime).Seconds()) } } for snapshot := range subscription { + watchableDepth.With(meta.LabelValues()...).RecordInt(int64(len(subscription))) for _, update := range snapshot.Updates { - handle(Update[K, V](update)) + startHandleTime := time.Now() + handle(Update[K, V](update), errChans) + watchableHandleUpdates.With(meta.LabelValues()...).Increment() + watchableHandleUpdateTimeSeconds.With(meta.LabelValues()...).Record(time.Since(startHandleTime).Seconds()) } } } diff --git a/internal/message/watchutil_test.go b/internal/message/watchutil_test.go index 39674ed7eec8..e3e1a98e9745 100644 --- a/internal/message/watchutil_test.go +++ b/internal/message/watchutil_test.go @@ -23,8 +23,9 @@ func TestHandleSubscriptionAlreadyClosed(t *testing.T) { var calls int message.HandleSubscription[string, any]( + message.UpdateMetadata{Component: "test-component"}, ch, - func(message.Update[string, any]) { calls++ }, + func(update message.Update[string, any], errChans chan error) { calls++ }, ) assert.Equal(t, 0, calls) } @@ -47,8 +48,9 @@ func TestHandleSubscriptionAlreadyInitialized(t *testing.T) { var storeCalls int var deleteCalls int message.HandleSubscription[string, any]( + message.UpdateMetadata{Component: "test-component"}, m.Subscribe(context.Background()), - func(update message.Update[string, any]) { + func(update message.Update[string, any], errChans chan error) { end() if update.Delete { deleteCalls++ @@ -121,7 +123,7 @@ func TestXdsIRUpdates(t *testing.T) { }() updates := 0 - message.HandleSubscription(snapshotC, func(u message.Update[string, *ir.Xds]) { + message.HandleSubscription(message.UpdateMetadata{Component: "test-component"}, snapshotC, func(u message.Update[string, *ir.Xds], errChans chan error) { end() if u.Key == "test" { updates += 1 diff --git a/internal/metrics/definition.go b/internal/metrics/definition.go new file mode 100644 index 000000000000..1d6ab0e39ef7 --- /dev/null +++ b/internal/metrics/definition.go @@ -0,0 +1,146 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. 
+ +package metrics + +import ( + "errors" + "sync" + + "go.opentelemetry.io/otel" + api "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/sdk/metric" + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" + + "github.com/envoyproxy/gateway/api/v1alpha1" + log "github.com/envoyproxy/gateway/internal/logging" +) + +var ( + meter = func() api.Meter { + return otel.GetMeterProvider().Meter("envoy-gateway") + } + + monitoringLogger = log.DefaultLogger(v1alpha1.LogLevelInfo).WithName("metrics") +) + +func init() { + otel.SetLogger(monitoringLogger.Logger) +} + +// MetricType is the type of a metric. +type MetricType string + +// Metric type supports: +// * Counter: A Counter is a simple metric that only goes up (increments). +// +// * Gauge: A Gauge is a metric that represent +// a single numerical value that can arbitrarily go up and down. +// +// * Histogram: A Histogram samples observations and counts them in configurable buckets. +// It also provides a sum of all observed values. +// It's used to visualize the statistical distribution of these observations. + +const ( + CounterType MetricType = "Counter" + GaugeType MetricType = "Gauge" + HistogramType MetricType = "Histogram" +) + +// A Metric collects numerical observations. +type Metric interface { + // Name returns the name value of a Metric. + Name() string + + // Record makes an observation of the provided value for the given measure. + Record(value float64) + + // RecordInt makes an observation of the provided value for the measure. + RecordInt(value int64) + + // Increment records a value of 1 for the current measure. + // For Counters, this is equivalent to adding 1 to the current value. + // For Gauges, this is equivalent to setting the value to 1. + // For Histograms, this is equivalent to making an observation of value 1. + Increment() + + // Decrement records a value of -1 for the current measure. + // For Counters, this is equivalent to subtracting -1 to the current value. + // For Gauges, this is equivalent to setting the value to -1. + // For Histograms, this is equivalent to making an observation of value -1. + Decrement() + + // With creates a new Metric, with the LabelValues provided. + // This allows creating a set of pre-dimensioned data for recording purposes. + // This is primarily used for documentation and convenience. + // Metrics created with this method do not need to be registered (they share the registration of their parent Metric). + With(labelValues ...LabelValue) Metric +} + +// ExportAllDefinitions reports all currently registered metric definitions. +func ExportAllDefinitions() []Definition { + defs.mu.Lock() + defer defs.mu.Unlock() + + defs := maps.Values(defs.known) + slices.SortFunc(defs, func(a, b Definition) bool { + return a.Name < b.Name + }) + return defs +} + +// Definition records a metric's metadata. +type Definition struct { + Name string + Type MetricType + Description string + Bounds []float64 +} + +// metrics stores known metrics +type def struct { + started bool + mu sync.Mutex + known map[string]Definition +} + +// defs is a global that stores all registered metrics +var defs = def{ + known: map[string]Definition{}, +} + +// register records a newly defined metric. Only valid before an exporter is set. 
+func (d *def) register(def Definition) { + d.mu.Lock() + defer d.mu.Unlock() + if d.started { + monitoringLogger.Error(errors.New("cannot initialize metric after metric has started"), "metric", def.Name) + } + d.known[def.Name] = def +} + +// preAddOptions runs pre-run steps before adding to meter provider. +func (d *def) preAddOptions() []metric.Option { + d.mu.Lock() + defer d.mu.Unlock() + d.started = true + opts := []metric.Option{} + for name, def := range d.known { + if def.Bounds == nil { + continue + } + // for each histogram metric (i.e. those with bounds), set up a view explicitly defining those buckets. + v := metric.WithView(metric.NewView( + metric.Instrument{Name: name}, + metric.Stream{ + Aggregation: metric.AggregationExplicitBucketHistogram{ + Boundaries: def.Bounds, + }}, + )) + opts = append(opts, v) + } + return opts +} diff --git a/internal/metrics/doc.go b/internal/metrics/doc.go new file mode 100644 index 000000000000..b601b082be9c --- /dev/null +++ b/internal/metrics/doc.go @@ -0,0 +1,6 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics diff --git a/internal/metrics/label.go b/internal/metrics/label.go new file mode 100644 index 000000000000..a3b3c4b03eaf --- /dev/null +++ b/internal/metrics/label.go @@ -0,0 +1,37 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +import "go.opentelemetry.io/otel/attribute" + +// NewLabel will attempt to create a new Label. +func NewLabel(key string) Label { + return Label{attribute.Key(key)} +} + +// A Label provides a named dimension for a Metric. +type Label struct { + key attribute.Key +} + +// Value creates a new LabelValue for the Label. +func (l Label) Value(value string) LabelValue { + return LabelValue{l.key.String(value)} +} + +// A LabelValue represents a Label with a specific value. It is used to record +// values for a Metric. +type LabelValue struct { + keyValue attribute.KeyValue +} + +func (l LabelValue) Key() Label { + return Label{l.keyValue.Key} +} + +func (l LabelValue) Value() string { + return l.keyValue.Value.AsString() +} diff --git a/internal/metrics/metric.go b/internal/metrics/metric.go new file mode 100644 index 000000000000..9b953a34e0dc --- /dev/null +++ b/internal/metrics/metric.go @@ -0,0 +1,73 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +import ( + "go.opentelemetry.io/otel/attribute" +) + +// base metric implementation. +type base struct { + name string + attrs []attribute.KeyValue + m Metric +} + +func (f base) Name() string { + return f.name +} + +func (f base) Increment() { + f.m.Record(1) +} + +func (f base) Decrement() { + f.m.Record(-1) +} + +func (f base) RecordInt(value int64) { + f.m.Record(float64(value)) +} + +// disabled metric implementation. 
+type disabled struct { + name string +} + +// Decrement implements Metric +func (dm *disabled) Decrement() {} + +// Increment implements Metric +func (dm *disabled) Increment() {} + +// Name implements Metric +func (dm *disabled) Name() string { + return dm.name +} + +// Record implements Metric +func (dm *disabled) Record(value float64) {} + +// RecordInt implements Metric +func (dm *disabled) RecordInt(value int64) {} + +// With implements Metric +func (dm *disabled) With(labelValues ...LabelValue) Metric { + return dm +} + +var _ Metric = &disabled{} + +func mergeAttributes(bm base, labelValues []LabelValue) ([]attribute.KeyValue, attribute.Set) { + attrs := make([]attribute.KeyValue, 0, len(bm.attrs)+len(labelValues)) + attrs = append(attrs, bm.attrs...) + for _, v := range labelValues { + attrs = append(attrs, v.keyValue) + } + + set := attribute.NewSet(attrs...) + return attrs, set +} diff --git a/internal/metrics/metric_counter.go b/internal/metrics/metric_counter.go new file mode 100644 index 000000000000..5a9f18a845bc --- /dev/null +++ b/internal/metrics/metric_counter.go @@ -0,0 +1,73 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +import ( + "context" + + api "go.opentelemetry.io/otel/metric" +) + +var _ Metric = &counter{} + +// NewCounter creates a new Counter Metric (the values will be cumulative). +// That means that data collected by the new Metric will be summed before export. +func NewCounter(name, description string, opts ...Options) Metric { + defs.register(Definition{ + Name: name, + Type: CounterType, + Description: description, + }) + o, dm := metricOptions(name, description, opts...) + if dm != nil { + return dm + } + return newCounter(o) +} + +type counter struct { + base + c api.Float64Counter + // preRecordOptions is just a precomputation to avoid allocations on each record call + preRecordOptions []api.AddOption +} + +func (f *counter) Record(value float64) { + if f.preRecordOptions != nil { + f.c.Add(context.Background(), value, f.preRecordOptions...) + } else { + f.c.Add(context.Background(), value) + } +} + +func (f *counter) With(labelValues ...LabelValue) Metric { + attrs, set := mergeAttributes(f.base, labelValues) + m := &counter{ + c: f.c, + preRecordOptions: []api.AddOption{api.WithAttributeSet(set)}, + } + m.base = base{ + name: f.name, + attrs: attrs, + m: m, + } + return m +} + +func newCounter(o options) *counter { + c, err := meter().Float64Counter(o.name, + api.WithDescription(o.description), + api.WithUnit(string(o.unit))) + if err != nil { + monitoringLogger.Error(err, "failed to create counter") + } + m := &counter{c: c} + m.base = base{ + name: o.name, + m: m, + } + return m +} diff --git a/internal/metrics/metric_gauge.go b/internal/metrics/metric_gauge.go new file mode 100644 index 000000000000..5c39aaa70021 --- /dev/null +++ b/internal/metrics/metric_gauge.go @@ -0,0 +1,106 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +import ( + "context" + "sync" + + "go.opentelemetry.io/otel/attribute" + api "go.opentelemetry.io/otel/metric" +) + +var _ Metric = &gauge{} + +// NewGauge creates a new Gauge Metric. That means that data collected by the new +// Metric will export only the last recorded value. 
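+// Unlike counters and histograms, gauges are observed lazily: newGauge registers a
+// Float64ObservableGauge callback that reports the most recent value stored for each
+// label set at collection time.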
+func NewGauge(name, description string, opts ...Options) Metric { + defs.register(Definition{ + Name: name, + Type: GaugeType, + Description: description, + }) + o, dm := metricOptions(name, description, opts...) + if dm != nil { + return dm + } + + return newGauge(o) +} + +type gauge struct { + base + g api.Float64ObservableGauge + + mutex *sync.RWMutex + + // labelSets stores a map of attributes -> values, for gauges. + labelSets map[attribute.Set]*gaugeValues + current *gaugeValues +} + +type gaugeValues struct { + val float64 + opt []api.ObserveOption +} + +func (f *gauge) Record(value float64) { + f.mutex.Lock() + if f.current == nil { + f.current = &gaugeValues{} + f.labelSets[attribute.NewSet()] = f.current + } + f.current.val = value + f.mutex.Unlock() +} + +func (f *gauge) With(labelValues ...LabelValue) Metric { + attrs, set := mergeAttributes(f.base, labelValues) + m := &gauge{ + g: f.g, + mutex: f.mutex, + labelSets: f.labelSets, + } + if _, f := m.labelSets[set]; !f { + m.labelSets[set] = &gaugeValues{ + opt: []api.ObserveOption{api.WithAttributeSet(set)}, + } + } + m.current = m.labelSets[set] + m.base = base{ + name: f.name, + attrs: attrs, + m: m, + } + return m +} + +func newGauge(o options) *gauge { + r := &gauge{ + mutex: &sync.RWMutex{}, + } + r.labelSets = map[attribute.Set]*gaugeValues{} + g, err := meter().Float64ObservableGauge(o.name, + api.WithFloat64Callback(func(ctx context.Context, observer api.Float64Observer) error { + r.mutex.Lock() + defer r.mutex.Unlock() + for _, gv := range r.labelSets { + observer.Observe(gv.val, gv.opt...) + } + return nil + }), + api.WithDescription(o.description), + api.WithUnit(string(o.unit))) + if err != nil { + monitoringLogger.Error(err, "failed to create gauge") + } + r.g = g + r.base = base{ + name: o.name, + m: r, + } + return r +} diff --git a/internal/metrics/metric_histogram.go b/internal/metrics/metric_histogram.go new file mode 100644 index 000000000000..c7ce7670401d --- /dev/null +++ b/internal/metrics/metric_histogram.go @@ -0,0 +1,74 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +import ( + "context" + + api "go.opentelemetry.io/otel/metric" +) + +var _ Metric = &histogram{} + +// NewHistogram creates a new Metric with an aggregation type of Histogram. +// This means that the data collected by the Metric will be collected and exported as a histogram, with the specified bounds. +func NewHistogram(name, description string, bounds []float64, opts ...Options) Metric { + defs.register(Definition{ + Name: name, + Type: HistogramType, + Description: description, + Bounds: bounds, + }) + o, dm := metricOptions(name, description, opts...) + if dm != nil { + return dm + } + return newHistogram(o) +} + +type histogram struct { + base + d api.Float64Histogram + // preRecordOptions is just a precomputation to avoid allocations on each record call + preRecordOptions []api.RecordOption +} + +func (f *histogram) Record(value float64) { + if f.preRecordOptions != nil { + f.d.Record(context.Background(), value, f.preRecordOptions...) 
+ } else { + f.d.Record(context.Background(), value) + } +} + +func (f *histogram) With(labelValues ...LabelValue) Metric { + attrs, set := mergeAttributes(f.base, labelValues) + m := &histogram{ + d: f.d, + preRecordOptions: []api.RecordOption{api.WithAttributeSet(set)}, + } + m.base = base{ + name: f.name, + attrs: attrs, + m: m, + } + return m +} + +func newHistogram(o options) *histogram { + d, err := meter().Float64Histogram(o.name, + api.WithDescription(o.description), + api.WithUnit(string(o.unit))) + if err != nil { + monitoringLogger.Error(err, "failed to create histogram") + } + m := &histogram{d: d} + m.base = base{ + name: o.name, + m: m, + } + return m +} diff --git a/internal/metrics/options.go b/internal/metrics/options.go new file mode 100644 index 000000000000..e8f3c9559f96 --- /dev/null +++ b/internal/metrics/options.go @@ -0,0 +1,43 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +// Options encode changes to the options passed to a Metric at creation time. +type Options func(*options) + +type options struct { + enabledCondition func() bool + unit Unit + name string + description string +} + +// WithUnit provides configuration options for a new Metric, providing unit of measure +// information for a new Metric. +func WithUnit(unit Unit) Options { + return func(opts *options) { + opts.unit = unit + } +} + +// WithEnabled allows a metric to be condition enabled if the provided function returns true. +// If disabled, metric operations will do nothing. +func WithEnabled(enabled func() bool) Options { + return func(o *options) { + o.enabledCondition = enabled + } +} + +func metricOptions(name, description string, opts ...Options) (options, Metric) { + o := options{unit: None, name: name, description: description} + for _, opt := range opts { + opt(&o) + } + if o.enabledCondition != nil && !o.enabledCondition() { + return o, &disabled{name: name} + } + return o, nil +} diff --git a/internal/metrics/register.go b/internal/metrics/register.go new file mode 100644 index 000000000000..3e59691289fd --- /dev/null +++ b/internal/metrics/register.go @@ -0,0 +1,160 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +import ( + "context" + "fmt" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + otelprom "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/sdk/metric" + "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/envoyproxy/gateway/internal/envoygateway/config" +) + +// Init initializes and registers the global metrics server. 
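+// The Kubernetes provider calls Init once at startup: when the Prometheus pull endpoint
+// is enabled, the controller-runtime metrics registry and the Envoy Gateway admin address
+// are reused for it, and every configured sink is added as an OTLP push exporter.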
+func Init(svr *config.Server, opts *manager.Options) error {
+ options := newOptions(svr, opts)
+ if err := register(options); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func newOptions(svr *config.Server, opts *manager.Options) metricsOptions {
+ newOpts := metricsOptions{}
+
+ if svr.EnvoyGateway.IfEnablePrometheus() {
+ newOpts.pullOptions.enable = true
+ newOpts.pullOptions.address = svr.EnvoyGateway.GetEnvoyGatewayAdminAddress()
+ newOpts.pullOptions.reg = metricsserver.Registry
+ opts.Metrics.BindAddress = svr.EnvoyGateway.GetEnvoyGatewayAdminAddress()
+ }
+
+ for _, sink := range svr.EnvoyGateway.GetEnvoyGatewayTelemetry().Metrics.Sinks {
+ newOpts.pushOptions.sinks = append(newOpts.pushOptions.sinks, metricsSink{
+ host: sink.Host,
+ port: sink.Port,
+ protocol: sink.Protocol,
+ })
+ }
+
+ return newOpts
+}
+
+// register builds an OpenTelemetry MeterProvider from the configured pull/push options
+// and installs it as the global meter provider.
+func register(opts metricsOptions) error {
+ otelOpts := []metric.Option{}
+
+ if err := registerOTELPromExporter(&otelOpts, opts); err != nil {
+ return err
+ }
+ if err := registerOTELHTTPexporter(&otelOpts, opts); err != nil {
+ return err
+ }
+ if err := registerOTELgRPCexporter(&otelOpts, opts); err != nil {
+ return err
+ }
+ otelOpts = append(otelOpts, defs.preAddOptions()...)
+
+ mp := metric.NewMeterProvider(otelOpts...)
+ otel.SetMeterProvider(mp)
+
+ return nil
+}
+
+// registerOTELPromExporter registers OTEL prometheus exporter (PULL mode).
+func registerOTELPromExporter(otelOpts *[]metric.Option, opts metricsOptions) error {
+ if opts.pullOptions.enable {
+ promOpts := []otelprom.Option{
+ otelprom.WithoutScopeInfo(),
+ otelprom.WithoutTargetInfo(),
+ otelprom.WithoutUnits(),
+ otelprom.WithRegisterer(opts.pullOptions.reg),
+ otelprom.WithoutCounterSuffixes(),
+ }
+ promreader, err := otelprom.New(promOpts...)
+ if err != nil {
+ return err
+ }
+
+ *otelOpts = append(*otelOpts, metric.WithReader(promreader))
+ monitoringLogger.Info("initialized metrics pull endpoint", "address", opts.pullOptions.address)
+ }
+
+ return nil
+}
+
+// registerOTELHTTPexporter registers OTEL HTTP metrics exporter (PUSH mode).
+func registerOTELHTTPexporter(otelOpts *[]metric.Option, opts metricsOptions) error {
+ for _, sink := range opts.pushOptions.sinks {
+ if sink.protocol == "http" {
+ address := fmt.Sprintf("%s:%d", sink.host, sink.port)
+ httpexporter, err := otlpmetrichttp.New(
+ context.Background(),
+ otlpmetrichttp.WithEndpoint(address),
+ otlpmetrichttp.WithInsecure(),
+ )
+ if err != nil {
+ return err
+ }
+
+ otelreader := metric.NewPeriodicReader(httpexporter)
+ *otelOpts = append(*otelOpts, metric.WithReader(otelreader))
+ monitoringLogger.Info("initialized otel http metrics push endpoint", "address", address)
+ }
+ }
+
+ return nil
+}
+
+// registerOTELgRPCexporter registers OTEL gRPC metrics exporter (PUSH mode).
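+// Only sinks whose protocol is "grpc" are handled here; each one gets a periodic reader
+// pushing OTLP metrics to host:port over an insecure connection.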
+func registerOTELgRPCexporter(otelOpts *[]metric.Option, opts metricsOptions) error { + for _, sink := range opts.pushOptions.sinks { + if sink.protocol == "grpc" { + address := fmt.Sprintf("%s:%d", sink.host, sink.port) + httpexporter, err := otlpmetricgrpc.New( + context.Background(), + otlpmetricgrpc.WithEndpoint(address), + otlpmetricgrpc.WithInsecure(), + ) + if err != nil { + return err + } + + otelreader := metric.NewPeriodicReader(httpexporter) + *otelOpts = append(*otelOpts, metric.WithReader(otelreader)) + monitoringLogger.Info("initialized otel grpc metrics push endpoint", "address", address) + } + } + + return nil +} + +type metricsOptions struct { + pullOptions struct { + reg prometheus.Registerer + address string + enable bool + } + pushOptions struct { + sinks []metricsSink + } +} + +type metricsSink struct { + protocol string + host string + port int32 +} diff --git a/internal/metrics/sample_counter_test.go b/internal/metrics/sample_counter_test.go new file mode 100644 index 000000000000..050cc557c519 --- /dev/null +++ b/internal/metrics/sample_counter_test.go @@ -0,0 +1,23 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics_test + +import "github.com/envoyproxy/gateway/internal/metrics" + +var ( + irUpdates = metrics.NewCounter( + "ir_updates_total", + "Number of IR updates, by ir type", + ) +) + +func NewCounter() { + // increment on every xds ir update + irUpdates.With(irType.Value("xds")).Increment() + + // xds ir updates double + irUpdates.With(irType.Value("xds")).Record(2) +} diff --git a/internal/metrics/sample_gauge_test.go b/internal/metrics/sample_gauge_test.go new file mode 100644 index 000000000000..6b287ed9ca1a --- /dev/null +++ b/internal/metrics/sample_gauge_test.go @@ -0,0 +1,27 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics_test + +import "github.com/envoyproxy/gateway/internal/metrics" + +var ( + irType = metrics.NewLabel("ir-type") + currentIRsNum = metrics.NewGauge( + "current_irs_queue_num", + "current number of ir in queue, by ir type", + ) +) + +func NewGauge() { + // only the last recorded value (2) will be exported for this gauge + currentIRsNum.With(irType.Value("xds")).Record(1) + currentIRsNum.With(irType.Value("xds")).Record(3) + currentIRsNum.With(irType.Value("xds")).Record(2) + + currentIRsNum.With(irType.Value("infra")).Record(1) + currentIRsNum.With(irType.Value("infra")).Record(3) + currentIRsNum.With(irType.Value("infra")).Record(2) +} diff --git a/internal/metrics/sample_histogram_test.go b/internal/metrics/sample_histogram_test.go new file mode 100644 index 000000000000..b34658fcbe54 --- /dev/null +++ b/internal/metrics/sample_histogram_test.go @@ -0,0 +1,23 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. 
+ +package metrics_test + +import "github.com/envoyproxy/gateway/internal/metrics" + +var ( + method = metrics.NewLabel("method") + + sentBytes = metrics.NewHistogram( + "sent_bytes_total", + "Histogram of sent bytes by method", + []float64{10, 50, 100, 1000, 10000}, + metrics.WithUnit(metrics.Bytes), + ) +) + +func NewHistogram() { + sentBytes.With(method.Value("/request/path/1")).Record(458) +} diff --git a/internal/metrics/units.go b/internal/metrics/units.go new file mode 100644 index 000000000000..1c7b5ff13c20 --- /dev/null +++ b/internal/metrics/units.go @@ -0,0 +1,18 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package metrics + +// Unit encodes the standard name for describing the quantity +// measured by a Metric (if applicable). +type Unit string + +// Predefined units for use with the metrics package. +const ( + None Unit = "1" + Bytes Unit = "By" + Seconds Unit = "s" + Milliseconds Unit = "ms" +) diff --git a/internal/provider/kubernetes/controller.go b/internal/provider/kubernetes/controller.go index dc7f1b3e6d1d..72c30e0ea8fc 100644 --- a/internal/provider/kubernetes/controller.go +++ b/internal/provider/kubernetes/controller.go @@ -61,14 +61,14 @@ const ( ) type gatewayAPIReconciler struct { - client client.Client - log logging.Logger - statusUpdater status.Updater - classController gwapiv1b1.GatewayController - store *kubernetesProviderStore - namespace string - namespaceLabels []string - envoyGateway *egv1a1.EnvoyGateway + client client.Client + log logging.Logger + statusUpdater status.Updater + classController gwapiv1b1.GatewayController + store *kubernetesProviderStore + namespace string + k8sResourceNamespaceLabels []string + envoyGateway *egv1a1.EnvoyGateway resources *message.ProviderResources extGVKs []schema.GroupVersionKind @@ -88,27 +88,27 @@ func newGatewayAPIController(mgr manager.Manager, cfg *config.Server, su status. 
} } - var namespaceLabels []string + var k8sResourceNamespaceLabels []string byNamespaceSelector := cfg.EnvoyGateway.Provider != nil && cfg.EnvoyGateway.Provider.Kubernetes != nil && cfg.EnvoyGateway.Provider.Kubernetes.Watch != nil && cfg.EnvoyGateway.Provider.Kubernetes.Watch.Type == egv1a1.KubernetesWatchModeTypeNamespaceSelectors && len(cfg.EnvoyGateway.Provider.Kubernetes.Watch.NamespaceSelectors) != 0 if byNamespaceSelector { - namespaceLabels = cfg.EnvoyGateway.Provider.Kubernetes.Watch.NamespaceSelectors + k8sResourceNamespaceLabels = cfg.EnvoyGateway.Provider.Kubernetes.Watch.NamespaceSelectors } r := &gatewayAPIReconciler{ - client: mgr.GetClient(), - log: cfg.Logger, - classController: gwapiv1b1.GatewayController(cfg.EnvoyGateway.Gateway.ControllerName), - namespace: cfg.Namespace, - namespaceLabels: namespaceLabels, - statusUpdater: su, - resources: resources, - extGVKs: extGVKs, - store: newProviderStore(), - envoyGateway: cfg.EnvoyGateway, + client: mgr.GetClient(), + log: cfg.Logger, + classController: gwapiv1b1.GatewayController(cfg.EnvoyGateway.Gateway.ControllerName), + namespace: cfg.Namespace, + k8sResourceNamespaceLabels: k8sResourceNamespaceLabels, + statusUpdater: su, + resources: resources, + extGVKs: extGVKs, + store: newProviderStore(), + envoyGateway: cfg.EnvoyGateway, } c, err := controller.New("gatewayapi", mgr, controller.Options{Reconciler: r}) @@ -450,7 +450,7 @@ func (r *gatewayAPIReconciler) findReferenceGrant(ctx context.Context, from, to } refGrants := refGrantList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var rgs []gwapiv1a2.ReferenceGrant for _, refGrant := range refGrants { ns := refGrant.GetNamespace() @@ -494,7 +494,7 @@ func (r *gatewayAPIReconciler) processGateways(ctx context.Context, acceptedGC * } gateways := gatewayList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var gtws []gwapiv1b1.Gateway for _, gtw := range gateways { ns := gtw.GetNamespace() @@ -1072,8 +1072,8 @@ func (r *gatewayAPIReconciler) removeFinalizer(ctx context.Context, obj client.O func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // Gateway object status updater go func() { - message.HandleSubscription(r.resources.GatewayStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *gwapiv1b1.GatewayStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "gateway-status"}, r.resources.GatewayStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *gwapiv1b1.GatewayStatus], errChans chan error) { // skip delete updates. 
if update.Delete { return @@ -1082,6 +1082,7 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { gtw := new(gwapiv1b1.Gateway) if err := r.client.Get(ctx, update.Key, gtw); err != nil { r.log.Error(err, "gateway not found", "namespace", gtw.Namespace, "name", gtw.Name) + errChans <- err return } // Set the updated Status and call the status update @@ -1094,8 +1095,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // HTTPRoute object status updater go func() { - message.HandleSubscription(r.resources.HTTPRouteStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *gwapiv1b1.HTTPRouteStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "httproute-status"}, r.resources.HTTPRouteStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *gwapiv1b1.HTTPRouteStatus], errChans chan error) { // skip delete updates. if update.Delete { return @@ -1108,7 +1109,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { h, ok := obj.(*gwapiv1b1.HTTPRoute) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } hCopy := h.DeepCopy() hCopy.Status.Parents = val.Parents @@ -1122,8 +1125,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // GRPCRoute object status updater go func() { - message.HandleSubscription(r.resources.GRPCRouteStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *gwapiv1a2.GRPCRouteStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "grpcroute-status"}, r.resources.GRPCRouteStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *gwapiv1a2.GRPCRouteStatus], errChans chan error) { // skip delete updates. if update.Delete { return @@ -1136,7 +1139,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { h, ok := obj.(*gwapiv1a2.GRPCRoute) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } hCopy := h.DeepCopy() hCopy.Status.Parents = val.Parents @@ -1150,8 +1155,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // TLSRoute object status updater go func() { - message.HandleSubscription(r.resources.TLSRouteStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *gwapiv1a2.TLSRouteStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "tlsroute-status"}, r.resources.TLSRouteStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *gwapiv1a2.TLSRouteStatus], errChans chan error) { // skip delete updates. 
if update.Delete { return @@ -1164,7 +1169,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { t, ok := obj.(*gwapiv1a2.TLSRoute) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } tCopy := t.DeepCopy() tCopy.Status.Parents = val.Parents @@ -1178,8 +1185,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // TCPRoute object status updater go func() { - message.HandleSubscription(r.resources.TCPRouteStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *gwapiv1a2.TCPRouteStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "tcproute-status"}, r.resources.TCPRouteStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *gwapiv1a2.TCPRouteStatus], errChans chan error) { // skip delete updates. if update.Delete { return @@ -1192,7 +1199,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { t, ok := obj.(*gwapiv1a2.TCPRoute) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } tCopy := t.DeepCopy() tCopy.Status.Parents = val.Parents @@ -1206,8 +1215,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // UDPRoute object status updater go func() { - message.HandleSubscription(r.resources.UDPRouteStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *gwapiv1a2.UDPRouteStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "udproute-status"}, r.resources.UDPRouteStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *gwapiv1a2.UDPRouteStatus], errChans chan error) { // skip delete updates. if update.Delete { return @@ -1220,7 +1229,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { t, ok := obj.(*gwapiv1a2.UDPRoute) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } tCopy := t.DeepCopy() tCopy.Status.Parents = val.Parents @@ -1234,8 +1245,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // EnvoyPatchPolicy object status updater go func() { - message.HandleSubscription(r.resources.EnvoyPatchPolicyStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *egv1a1.EnvoyPatchPolicyStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "envoypatchpolicy-status"}, r.resources.EnvoyPatchPolicyStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *egv1a1.EnvoyPatchPolicyStatus], errChans chan error) { // skip delete updates. 
if update.Delete { return @@ -1248,7 +1259,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { t, ok := obj.(*egv1a1.EnvoyPatchPolicy) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } tCopy := t.DeepCopy() tCopy.Status = *val @@ -1262,8 +1275,8 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { // ClientTrafficPolicy object status updater go func() { - message.HandleSubscription(r.resources.ClientTrafficPolicyStatuses.Subscribe(ctx), - func(update message.Update[types.NamespacedName, *egv1a1.ClientTrafficPolicyStatus]) { + message.HandleSubscription(message.UpdateMetadata{Component: "provider", Resource: "clienttrafficpolicy-status"}, r.resources.ClientTrafficPolicyStatuses.Subscribe(ctx), + func(update message.Update[types.NamespacedName, *egv1a1.ClientTrafficPolicyStatus], errChans chan error) { // skip delete updates. if update.Delete { return @@ -1276,7 +1289,9 @@ func (r *gatewayAPIReconciler) subscribeAndUpdateStatus(ctx context.Context) { Mutator: status.MutatorFunc(func(obj client.Object) client.Object { t, ok := obj.(*egv1a1.ClientTrafficPolicy) if !ok { - panic(fmt.Sprintf("unsupported object type %T", obj)) + err := fmt.Errorf("unsupported object type %T", obj) + errChans <- err + panic(err) } tCopy := t.DeepCopy() tCopy.Status = *val @@ -1305,7 +1320,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M predicate.ResourceVersionChangedPredicate{}, predicate.NewPredicateFuncs(r.hasManagedClass), } - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { epPredicates = append(epPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1318,7 +1333,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch Gateway CRUDs and reconcile affected GatewayClass. gPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.validateGatewayForReconcile)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { gPredicates = append(gPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1334,7 +1349,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch HTTPRoute CRUDs and process affected Gateways. httprPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { httprPredicates = append(httprPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1350,7 +1365,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch GRPCRoute CRUDs and process affected Gateways. grpcrPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { grpcrPredicates = append(grpcrPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1366,7 +1381,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch TLSRoute CRUDs and process affected Gateways. 
tlsrPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { tlsrPredicates = append(tlsrPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1382,7 +1397,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch UDPRoute CRUDs and process affected Gateways. udprPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { udprPredicates = append(udprPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1398,7 +1413,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch TCPRoute CRUDs and process affected Gateways. tcprPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { tcprPredicates = append(tcprPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1414,7 +1429,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch Service CRUDs and process affected *Route objects. servicePredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.validateServiceForReconcile)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { servicePredicates = append(servicePredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1443,7 +1458,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch EndpointSlice CRUDs and process affected *Route objects. esPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.validateEndpointSliceForReconcile)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { esPredicates = append(esPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1457,7 +1472,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch Node CRUDs to update Gateway Address exposed by Service of type NodePort. // Node creation/deletion and ExternalIP updates would require update in the Gateway nPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.handleNode)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { nPredicates = append(nPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } // resource address. @@ -1471,7 +1486,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch Secret CRUDs and process affected Gateways. secretPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.validateSecretForReconcile)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { secretPredicates = append(secretPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1484,7 +1499,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch ReferenceGrant CRUDs and process affected Gateways. 
rgPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { rgPredicates = append(rgPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1500,7 +1515,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch Deployment CRUDs and process affected Gateways. dPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.validateDeploymentForReconcile)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { dPredicates = append(dPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1513,7 +1528,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch AuthenticationFilter CRUDs and enqueue associated HTTPRoute objects. afPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.httpRoutesForAuthenticationFilter)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { afPredicates = append(afPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if err := c.Watch( @@ -1525,7 +1540,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M } rfPredicates := []predicate.Predicate{predicate.NewPredicateFuncs(r.httpRoutesForRateLimitFilter)} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { rfPredicates = append(rfPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } // Watch RateLimitFilter CRUDs and enqueue associated HTTPRoute objects. @@ -1539,7 +1554,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch EnvoyPatchPolicy if enabled in config eppPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { eppPredicates = append(eppPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } if r.envoyGateway.ExtensionAPIs != nil && r.envoyGateway.ExtensionAPIs.EnableEnvoyPatchPolicy { @@ -1555,7 +1570,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch ClientTrafficPolicy ctpPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { ctpPredicates = append(ctpPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } @@ -1571,7 +1586,7 @@ func (r *gatewayAPIReconciler) watchResources(ctx context.Context, mgr manager.M // Watch any additional GVKs from the registered extension. 
uPredicates := []predicate.Predicate{} - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { uPredicates = append(uPredicates, predicate.NewPredicateFuncs(r.hasMatchingNamespaceLabels)) } for _, gvk := range r.extGVKs { diff --git a/internal/provider/kubernetes/filters.go b/internal/provider/kubernetes/filters.go index c7d402a85933..68446e7986d9 100644 --- a/internal/provider/kubernetes/filters.go +++ b/internal/provider/kubernetes/filters.go @@ -21,7 +21,7 @@ func (r *gatewayAPIReconciler) getAuthenticationFilters(ctx context.Context) ([] } authens := authenList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var as []egv1a1.AuthenticationFilter for _, a := range authens { ns := a.GetNamespace() @@ -49,7 +49,7 @@ func (r *gatewayAPIReconciler) getRateLimitFilters(ctx context.Context) ([]egv1a } rateLimits := rateLimitList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var rls []egv1a1.RateLimitFilter for _, rl := range rateLimits { ns := rl.GetNamespace() @@ -81,7 +81,7 @@ func (r *gatewayAPIReconciler) getExtensionRefFilters(ctx context.Context) ([]un } uExtResources := uExtResourceList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var extRs []unstructured.Unstructured for _, extR := range uExtResources { ns := extR.GetNamespace() diff --git a/internal/provider/kubernetes/kubernetes.go b/internal/provider/kubernetes/kubernetes.go index 820868eeafb7..f9fc7c4818d1 100644 --- a/internal/provider/kubernetes/kubernetes.go +++ b/internal/provider/kubernetes/kubernetes.go @@ -15,12 +15,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/manager" - metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "github.com/envoyproxy/gateway/api/v1alpha1" "github.com/envoyproxy/gateway/internal/envoygateway" "github.com/envoyproxy/gateway/internal/envoygateway/config" "github.com/envoyproxy/gateway/internal/message" + "github.com/envoyproxy/gateway/internal/metrics" "github.com/envoyproxy/gateway/internal/status" ) @@ -41,9 +41,14 @@ func New(cfg *rest.Config, svr *config.Server, resources *message.ProviderResour LeaderElection: false, HealthProbeBindAddress: ":8081", LeaderElectionID: "5b9825d2.gateway.envoyproxy.io", - Metrics: metricsserver.Options{ - BindAddress: ":8080", - }, + } + + if address := svr.EnvoyGateway.GetEnvoyGatewayDebug().Address; address != nil { + mgrOpts.PprofBindAddress = fmt.Sprintf("%s:%d", address.Host, address.Port) + } + + if err := metrics.Init(svr, &mgrOpts); err != nil { + return nil, err } // TODO: implement config validation on the watch mode config diff --git a/internal/provider/kubernetes/predicates.go b/internal/provider/kubernetes/predicates.go index 746f99c5ee32..b6381b9b532c 100644 --- a/internal/provider/kubernetes/predicates.go +++ b/internal/provider/kubernetes/predicates.go @@ -61,7 +61,7 @@ type NamespaceGetter interface { GetNamespace() string } -// checkObjectNamespaceLabels checks if labels of namespace of the object is a subset of namespaceLabels +// checkObjectNamespaceLabels checks if labels of namespace of the object is a subset of k8sResourceNamespaceLabels // TODO: check if param can be an interface, so the caller doesn't need to get the namespace before calling // this function. 
func (r *gatewayAPIReconciler) checkObjectNamespaceLabels(nsString string) (bool, error) { @@ -77,7 +77,7 @@ func (r *gatewayAPIReconciler) checkObjectNamespaceLabels(nsString string) (bool ); err != nil { return false, err } - return containAll(ns.Labels, r.namespaceLabels), nil + return containAll(ns.Labels, r.k8sResourceNamespaceLabels), nil } func containAll(labels map[string]string, labelsToCheck []string) bool { @@ -350,7 +350,7 @@ func (r *gatewayAPIReconciler) httpRoutesForRateLimitFilter(obj client.Object) b } func (r *gatewayAPIReconciler) filterHTTPRoutesByNamespaceLabels(httpRoutes []gwapiv1b1.HTTPRoute) []gwapiv1b1.HTTPRoute { - if len(r.namespaceLabels) == 0 { + if len(r.k8sResourceNamespaceLabels) == 0 { return httpRoutes } diff --git a/internal/provider/kubernetes/predicates_test.go b/internal/provider/kubernetes/predicates_test.go index 6d156f083ed3..94c3a0bf8ba7 100644 --- a/internal/provider/kubernetes/predicates_test.go +++ b/internal/provider/kubernetes/predicates_test.go @@ -67,28 +67,28 @@ func TestGatewayClassHasMatchingController(t *testing.T) { func TestGatewayClassHasMatchingNamespaceLabels(t *testing.T) { ns := "namespace-1" testCases := []struct { - name string - labels []string - namespaceLabels []string - expect bool + name string + labels []string + k8sResourceNamespaceLabels []string + expect bool }{ { - name: "matching one label when namespace has one label", - labels: []string{"label-1"}, - namespaceLabels: []string{"label-1"}, - expect: true, + name: "matching one label when namespace has one label", + labels: []string{"label-1"}, + k8sResourceNamespaceLabels: []string{"label-1"}, + expect: true, }, { - name: "matching one label when namespace has two labels", - labels: []string{"label-1"}, - namespaceLabels: []string{"label-1", "label-2"}, - expect: true, + name: "matching one label when namespace has two labels", + labels: []string{"label-1"}, + k8sResourceNamespaceLabels: []string{"label-1", "label-2"}, + expect: true, }, { - name: "namespace has less labels than the specified labels", - labels: []string{"label-1", "label-2"}, - namespaceLabels: []string{"label-1"}, - expect: false, + name: "namespace has less labels than the specified labels", + labels: []string{"label-1", "label-2"}, + k8sResourceNamespaceLabels: []string{"label-1"}, + expect: false, }, } @@ -97,15 +97,15 @@ func TestGatewayClassHasMatchingNamespaceLabels(t *testing.T) { for _, tc := range testCases { tc := tc - namespaceLabelsToMap := make(map[string]string) - for _, l := range tc.namespaceLabels { - namespaceLabelsToMap[l] = "" + k8sResourceNamespaceLabelsToMap := make(map[string]string) + for _, l := range tc.k8sResourceNamespaceLabels { + k8sResourceNamespaceLabelsToMap[l] = "" } r := gatewayAPIReconciler{ - classController: v1alpha1.GatewayControllerName, - namespaceLabels: tc.labels, - log: logger, + classController: v1alpha1.GatewayControllerName, + k8sResourceNamespaceLabels: tc.labels, + log: logger, client: fakeclient.NewClientBuilder(). WithScheme(envoygateway.GetScheme()). WithObjects(&corev1.Namespace{ @@ -113,7 +113,7 @@ func TestGatewayClassHasMatchingNamespaceLabels(t *testing.T) { Kind: "Namespace", APIVersion: "v1", }, - ObjectMeta: v1.ObjectMeta{Name: ns, Labels: namespaceLabelsToMap}, + ObjectMeta: v1.ObjectMeta{Name: ns, Labels: k8sResourceNamespaceLabelsToMap}, }). 
Build(), } diff --git a/internal/provider/kubernetes/routes.go b/internal/provider/kubernetes/routes.go index 6edaec0b6b65..53f5779c7ab5 100644 --- a/internal/provider/kubernetes/routes.go +++ b/internal/provider/kubernetes/routes.go @@ -35,7 +35,7 @@ func (r *gatewayAPIReconciler) processTLSRoutes(ctx context.Context, gatewayName } tlsRoutes := tlsRouteList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var rts []gwapiv1a2.TLSRoute for _, rt := range tlsRoutes { ns := rt.GetNamespace() @@ -136,7 +136,7 @@ func (r *gatewayAPIReconciler) processGRPCRoutes(ctx context.Context, gatewayNam } grpcRoutes := grpcRouteList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var grs []gwapiv1a2.GRPCRoute for _, gr := range grpcRoutes { ns := gr.GetNamespace() @@ -308,7 +308,7 @@ func (r *gatewayAPIReconciler) processHTTPRoutes(ctx context.Context, gatewayNam } httpRoutes := httpRouteList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var hrs []gwapiv1b1.HTTPRoute for _, hr := range httpRoutes { ns := hr.GetNamespace() @@ -505,7 +505,7 @@ func (r *gatewayAPIReconciler) processTCPRoutes(ctx context.Context, gatewayName } tcpRoutes := tcpRouteList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var trs []gwapiv1a2.TCPRoute for _, tr := range tcpRoutes { ns := tr.GetNamespace() @@ -586,7 +586,7 @@ func (r *gatewayAPIReconciler) processUDPRoutes(ctx context.Context, gatewayName } udpRoutes := udpRouteList.Items - if len(r.namespaceLabels) != 0 { + if len(r.k8sResourceNamespaceLabels) != 0 { var urs []gwapiv1a2.UDPRoute for _, ur := range udpRoutes { ns := ur.GetNamespace() diff --git a/internal/xds/bootstrap/bootstrap_test.go b/internal/xds/bootstrap/bootstrap_test.go index bbee8f020265..678317dbcca7 100644 --- a/internal/xds/bootstrap/bootstrap_test.go +++ b/internal/xds/bootstrap/bootstrap_test.go @@ -27,7 +27,7 @@ func TestGetRenderedBootstrapConfig(t *testing.T) { { name: "enable-prometheus", proxyMetrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{}, + Prometheus: &egv1a1.ProxyPrometheusProvider{}, }, }, { @@ -65,7 +65,7 @@ func TestGetRenderedBootstrapConfig(t *testing.T) { Value: "cluster", }, }, - Prometheus: &egv1a1.PrometheusProvider{}, + Prometheus: &egv1a1.ProxyPrometheusProvider{}, }, }, } diff --git a/internal/xds/server/runner/runner.go b/internal/xds/server/runner/runner.go index 7147eb200a16..e4e41c0ec342 100644 --- a/internal/xds/server/runner/runner.go +++ b/internal/xds/server/runner/runner.go @@ -133,8 +133,8 @@ func registerServer(srv serverv3.Server, g *grpc.Server) { func (r *Runner) subscribeAndTranslate(ctx context.Context) { // Subscribe to resources - message.HandleSubscription(r.Xds.Subscribe(ctx), - func(update message.Update[string, *xdstypes.ResourceVersionTable]) { + message.HandleSubscription(message.UpdateMetadata{Component: r.Name()}, r.Xds.Subscribe(ctx), + func(update message.Update[string, *xdstypes.ResourceVersionTable], errChans chan error) { key := update.Key val := update.Value @@ -145,6 +145,7 @@ func (r *Runner) subscribeAndTranslate(ctx context.Context) { } else if val != nil && val.XdsResources != nil { if r.cache == nil { r.Logger.Error(err, "failed to init snapshot cache") + errChans <- err } else { // Update snapshot cache err = r.cache.GenerateNewSnapshot(key, val.XdsResources) @@ -152,6 +153,7 @@ func (r *Runner) subscribeAndTranslate(ctx context.Context) { } if err != nil { 
r.Logger.Error(err, "failed to generate a snapshot") + errChans <- err } }, ) diff --git a/internal/xds/translator/extension.go b/internal/xds/translator/extension.go index b8eae0c5f7a6..8a9d8e157b43 100644 --- a/internal/xds/translator/extension.go +++ b/internal/xds/translator/extension.go @@ -12,6 +12,7 @@ import ( "errors" "fmt" "reflect" + "time" clusterv3 "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" listenerv3 "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" @@ -43,12 +44,17 @@ func processExtensionPostRouteHook(route *routev3.Route, vHost *routev3.VirtualH for refIdx, ref := range irRoute.ExtensionRefs { unstructuredResources[refIdx] = ref.Object } + + startHookTime := time.Now() modifiedRoute, err := extRouteHookClient.PostRouteModifyHook( route, vHost.Domains, unstructuredResources, ) + extensionManagerPostHookTimeSeconds.With(targetLabel.Value(routeTarget)).Record(time.Since(startHookTime).Seconds()) + extensionManagerPostHookCalls.With(targetLabel.Value(routeTarget)).Increment() if err != nil { + extensionManagerPostHookCallErrors.With(targetLabel.Value(routeTarget)).Increment() // Maybe logging the error is better here, but this only happens when an extension is in-use // so if modification fails then we should probably treat that as a serious problem. return err @@ -75,8 +81,13 @@ func processExtensionPostVHostHook(vHost *routev3.VirtualHost, em *extensionType if extVHHookClient == nil { return nil } + + startHookTime := time.Now() modifiedVH, err := extVHHookClient.PostVirtualHostModifyHook(vHost) + extensionManagerPostHookTimeSeconds.With(targetLabel.Value(virtualHostTarget)).Record(time.Since(startHookTime).Seconds()) + extensionManagerPostHookCalls.With(targetLabel.Value(virtualHostTarget)).Increment() if err != nil { + extensionManagerPostHookCallErrors.With(targetLabel.Value(virtualHostTarget)).Increment() // Maybe logging the error is better here, but this only happens when an extension is in-use // so if modification fails then we should probably treat that as a serious problem. 
return err @@ -102,8 +113,12 @@ func processExtensionPostListenerHook(tCtx *types.ResourceVersionTable, xdsListe extManager := *em extListenerHookClient := extManager.GetPostXDSHookClient(v1alpha1.XDSHTTPListener) if extListenerHookClient != nil { + startHookTime := time.Now() modifiedListener, err := extListenerHookClient.PostHTTPListenerModifyHook(xdsListener) + extensionManagerPostHookTimeSeconds.With(targetLabel.Value(listenerTarget)).Record(time.Since(startHookTime).Seconds()) + extensionManagerPostHookCalls.With(targetLabel.Value(listenerTarget)).Increment() if err != nil { + extensionManagerPostHookCallErrors.With(targetLabel.Value(listenerTarget)).Increment() return err } else if modifiedListener != nil { // Use the resource table to update the listener with the modified version returned by the extension @@ -152,9 +167,12 @@ func processExtensionPostTranslationHook(tCtx *types.ResourceVersionTable, em *e for idx, secret := range secrets { oldSecrets[idx] = secret.(*tlsv3.Secret) } - + startHookTime := time.Now() newClusters, newSecrets, err := extensionInsertHookClient.PostTranslateModifyHook(oldClusters, oldSecrets) + extensionManagerPostHookTimeSeconds.With(targetLabel.Value(clusterTarget)).Record(time.Since(startHookTime).Seconds()) + extensionManagerPostHookCalls.With(targetLabel.Value(clusterTarget)).Increment() if err != nil { + extensionManagerPostHookCallErrors.With(targetLabel.Value(clusterTarget)).Increment() return err } diff --git a/internal/xds/translator/metrics.go b/internal/xds/translator/metrics.go new file mode 100644 index 000000000000..a2c0cd9133fd --- /dev/null +++ b/internal/xds/translator/metrics.go @@ -0,0 +1,37 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. 
+ +package translator + +import "github.com/envoyproxy/gateway/internal/metrics" + +var ( + // metrics definitions + extensionManagerPostHookTimeSeconds = metrics.NewHistogram( + "extension_manager_post_hook_time_seconds", + "How long in seconds a post hook called in extension manager.", + []float64{0.001, 0.01, 0.1, 1, 5, 10}, + ) + + extensionManagerPostHookCalls = metrics.NewCounter( + "extension_manager_post_hook_calls_total", + "Total number of the post hook calls in extension manager.", + ) + + extensionManagerPostHookCallErrors = metrics.NewCounter( + "extension_manager_post_hook_call_errors_total", + "Total number of the post hook call errors in extension manager.", + ) + + // metrics label definitions + targetLabel = metrics.NewLabel("target") +) + +const ( + routeTarget = "route" + virtualHostTarget = "virtualHost" + listenerTarget = "listener" + clusterTarget = "cluster" +) diff --git a/internal/xds/translator/runner/runner.go b/internal/xds/translator/runner/runner.go index 17ff6c8619a7..ca9ce17e8781 100644 --- a/internal/xds/translator/runner/runner.go +++ b/internal/xds/translator/runner/runner.go @@ -49,8 +49,8 @@ func (r *Runner) Start(ctx context.Context) (err error) { func (r *Runner) subscribeAndTranslate(ctx context.Context) { // Subscribe to resources - message.HandleSubscription(r.XdsIR.Subscribe(ctx), - func(update message.Update[string, *ir.Xds]) { + message.HandleSubscription(message.UpdateMetadata{Component: r.Name()}, r.XdsIR.Subscribe(ctx), + func(update message.Update[string, *ir.Xds], errChans chan error) { r.Logger.Info("received an update") key := update.Key val := update.Value @@ -80,6 +80,7 @@ func (r *Runner) subscribeAndTranslate(ctx context.Context) { result, err := t.Translate(val) if err != nil { r.Logger.Error(err, "failed to translate xds ir") + errChans <- err return } diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 73bff134936c..bcb86738646b 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -201,7 +201,9 @@ EnvoyGateway is the schema for the envoygateways API. | `gateway` _[Gateway](#gateway)_ | Gateway defines desired Gateway API specific configuration. If unset, default configuration parameters will apply. | | `provider` _[EnvoyGatewayProvider](#envoygatewayprovider)_ | Provider defines the desired provider and provider-specific configuration. If unspecified, the Kubernetes provider is used with default configuration parameters. | | `logging` _[EnvoyGatewayLogging](#envoygatewaylogging)_ | Logging defines logging parameters for Envoy Gateway. | +| `telemetry` _[EnvoyGatewayTelemetry](#envoygatewaytelemetry)_ | Telemetry defines telemetry related configurations for envoy gateway. | | `admin` _[EnvoyGatewayAdmin](#envoygatewayadmin)_ | Admin defines the desired admin related abilities. If unspecified, the Admin is used with default configuration parameters. | +| `debug` _[EnvoyGatewayDebug](#envoygatewaydebug)_ | Debug defines the desired debug related abilities. If unspecified, the debug will not be running, including pprof, dump config etc. | | `rateLimit` _[RateLimit](#ratelimit)_ | RateLimit defines the configuration associated with the Rate Limit service deployed by Envoy Gateway required to implement the Global Rate limiting functionality. The specific rate limit service used here is the reference implementation in Envoy. For more details visit https://github.com/envoyproxy/ratelimit. 
This configuration is unneeded for "Local" rate limiting. | | `extensionManager` _[ExtensionManager](#extensionmanager)_ | ExtensionManager defines an extension manager to register for the Envoy Gateway Control Plane. | | `extensionApis` _[ExtensionAPISettings](#extensionapisettings)_ | ExtensionAPIs defines the settings related to specific Gateway API Extensions implemented by Envoy Gateway | @@ -220,7 +222,6 @@ _Appears in:_ | Field | Description | | --- | --- | | `address` _[EnvoyGatewayAdminAddress](#envoygatewayadminaddress)_ | Address defines the address of Envoy Gateway Admin Server. | -| `debug` _boolean_ | Debug defines if enable the /debug endpoint of Envoy Gateway. | #### EnvoyGatewayAdminAddress @@ -253,6 +254,37 @@ _Appears in:_ | `infrastructure` _[EnvoyGatewayInfrastructureProvider](#envoygatewayinfrastructureprovider)_ | Infrastructure defines the desired infrastructure provider. This provider is used to specify the provider to be used to provide an environment to deploy the out resources like the Envoy Proxy data plane. | +#### EnvoyGatewayDebug + + + +EnvoyGatewayDebug defines the Envoy Gateway Debug configuration. + +_Appears in:_ +- [EnvoyGateway](#envoygateway) +- [EnvoyGatewaySpec](#envoygatewayspec) + +| Field | Description | +| --- | --- | +| `dumpConfig` _boolean_ | DumpConfig defines if dump the Envoy Gateway config in logs. | +| `address` _[EnvoyGatewayDebugAddress](#envoygatewaydebugaddress)_ | Address defines the address of Envoy Gateway Debug Server. Pprof will use the debug address, if you set it to non-nil. | + + +#### EnvoyGatewayDebugAddress + + + +EnvoyGatewayDebugAddress defines the Envoy Gateway Debug Address configuration. + +_Appears in:_ +- [EnvoyGatewayDebug](#envoygatewaydebug) + +| Field | Description | +| --- | --- | +| `port` _integer_ | Port defines the port the debug server is exposed on. | +| `host` _string_ | Host defines the debug server hostname. | + + #### EnvoyGatewayFileResourceProvider @@ -336,6 +368,52 @@ _Appears in:_ | `level` _object (keys:[EnvoyGatewayLogComponent](#envoygatewaylogcomponent), values:[LogLevel](#loglevel))_ | Level is the logging level. If unspecified, defaults to "info". EnvoyGatewayLogComponent options: default/provider/gateway-api/xds-translator/xds-server/infrastructure/global-ratelimit. LogLevel options: debug/info/error/warn. | +#### EnvoyGatewayMetricSink + + + +EnvoyGatewayMetricSink defines control plane metric sinks where metrics are sent to. + +_Appears in:_ +- [EnvoyGatewayMetrics](#envoygatewaymetrics) + +| Field | Description | +| --- | --- | +| `type` _[MetricSinkType](#metricsinktype)_ | Type defines the metric sink type. EG control plane currently supports OpenTelemetry. | +| `host` _string_ | Host define the sink service hostname. | +| `protocol` _string_ | Protocol define the sink service protocol. | +| `port` _integer_ | Port defines the port the sink service is exposed on. | + + +#### EnvoyGatewayMetrics + + + +EnvoyGatewayMetrics defines control plane push/pull metrics configurations. + +_Appears in:_ +- [EnvoyGatewayTelemetry](#envoygatewaytelemetry) + +| Field | Description | +| --- | --- | +| `sinks` _[EnvoyGatewayMetricSink](#envoygatewaymetricsink) array_ | Sinks defines the metric sinks where metrics are sent to. | +| `prometheus` _[EnvoyGatewayPrometheusProvider](#envoygatewayprometheusprovider)_ | Prometheus defines the configuration for prometheus endpoint. | + + +#### EnvoyGatewayPrometheusProvider + + + +EnvoyGatewayPrometheusProvider will expose prometheus endpoint in pull mode. 
+ +_Appears in:_ +- [EnvoyGatewayMetrics](#envoygatewaymetrics) + +| Field | Description | +| --- | --- | +| `enable` _boolean_ | Enable defines if enables the prometheus metrics in pull mode. Default is true. | + + #### EnvoyGatewayProvider @@ -382,12 +460,29 @@ _Appears in:_ | `gateway` _[Gateway](#gateway)_ | Gateway defines desired Gateway API specific configuration. If unset, default configuration parameters will apply. | | `provider` _[EnvoyGatewayProvider](#envoygatewayprovider)_ | Provider defines the desired provider and provider-specific configuration. If unspecified, the Kubernetes provider is used with default configuration parameters. | | `logging` _[EnvoyGatewayLogging](#envoygatewaylogging)_ | Logging defines logging parameters for Envoy Gateway. | +| `telemetry` _[EnvoyGatewayTelemetry](#envoygatewaytelemetry)_ | Telemetry defines telemetry related configurations for envoy gateway. | | `admin` _[EnvoyGatewayAdmin](#envoygatewayadmin)_ | Admin defines the desired admin related abilities. If unspecified, the Admin is used with default configuration parameters. | +| `debug` _[EnvoyGatewayDebug](#envoygatewaydebug)_ | Debug defines the desired debug related abilities. If unspecified, the debug will not be running, including pprof, dump config etc. | | `rateLimit` _[RateLimit](#ratelimit)_ | RateLimit defines the configuration associated with the Rate Limit service deployed by Envoy Gateway required to implement the Global Rate limiting functionality. The specific rate limit service used here is the reference implementation in Envoy. For more details visit https://github.com/envoyproxy/ratelimit. This configuration is unneeded for "Local" rate limiting. | | `extensionManager` _[ExtensionManager](#extensionmanager)_ | ExtensionManager defines an extension manager to register for the Envoy Gateway Control Plane. | | `extensionApis` _[ExtensionAPISettings](#extensionapisettings)_ | ExtensionAPIs defines the settings related to specific Gateway API Extensions implemented by Envoy Gateway | +#### EnvoyGatewayTelemetry + + + +EnvoyGatewayTelemetry defines telemetry configurations for envoy gateway control plane. Control plane will focus on metrics observability telemetry and tracing telemetry later. + +_Appears in:_ +- [EnvoyGateway](#envoygateway) +- [EnvoyGatewaySpec](#envoygatewayspec) + +| Field | Description | +| --- | --- | +| `metrics` _[EnvoyGatewayMetrics](#envoygatewaymetrics)_ | Metrics defines metrics configuration for envoy gateway. | + + #### EnvoyJSONPatchConfig @@ -888,17 +983,6 @@ _Appears in:_ | `value` _string_ | Value defines the hard-coded value to add to each span. | -#### LogComponent - -_Underlying type:_ `string` - -LogComponent defines a component that supports a configured logging level. - -_Appears in:_ -- [ProxyLogging](#proxylogging) - - - #### LogLevel _Underlying type:_ `string` @@ -959,6 +1043,7 @@ _Underlying type:_ `string` _Appears in:_ +- [EnvoyGatewayMetricSink](#envoygatewaymetricsink) - [MetricSink](#metricsink) @@ -994,17 +1079,6 @@ _Appears in:_ | `port` _integer_ | Port defines the port the service is exposed on. | -#### PrometheusProvider - - - - - -_Appears in:_ -- [ProxyMetrics](#proxymetrics) - - - #### ProviderType _Underlying type:_ `string` @@ -1116,6 +1190,17 @@ _Appears in:_ | `value` _string_ | Value is a YAML string of the bootstrap. | +#### ProxyLogComponent + +_Underlying type:_ `string` + +ProxyLogComponent defines a component that supports a configured logging level. 
+ +_Appears in:_ +- [ProxyLogging](#proxylogging) + + + #### ProxyLogging @@ -1127,7 +1212,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `level` _object (keys:[LogComponent](#logcomponent), values:[LogLevel](#loglevel))_ | Level is a map of logging level per component, where the component is the key and the log level is the value. If unspecified, defaults to "default: warn". | +| `level` _object (keys:[ProxyLogComponent](#proxylogcomponent), values:[LogLevel](#loglevel))_ | Level is a map of logging level per component, where the component is the key and the log level is the value. If unspecified, defaults to "default: warn". | #### ProxyMetrics @@ -1141,12 +1226,23 @@ _Appears in:_ | Field | Description | | --- | --- | -| `prometheus` _[PrometheusProvider](#prometheusprovider)_ | Prometheus defines the configuration for Admin endpoint `/stats/prometheus`. | +| `prometheus` _[ProxyPrometheusProvider](#proxyprometheusprovider)_ | Prometheus defines the configuration for Admin endpoint `/stats/prometheus`. | | `sinks` _[MetricSink](#metricsink) array_ | Sinks defines the metric sinks where metrics are sent to. | | `matches` _[Match](#match) array_ | Matches defines configuration for selecting specific metrics instead of generating all metrics stats that are enabled by default. This helps reduce CPU and memory overhead in Envoy, but eliminating some stats may after critical functionality. Here are the stats that we strongly recommend not disabling: `cluster_manager.warming_clusters`, `cluster..membership_total`,`cluster..membership_healthy`, `cluster..membership_degraded`,reference https://github.com/envoyproxy/envoy/issues/9856, https://github.com/envoyproxy/envoy/issues/14610 | | `enableVirtualHostStats` _boolean_ | EnableVirtualHostStats enables envoy stat metrics for virtual hosts. | +#### ProxyPrometheusProvider + + + + + +_Appears in:_ +- [ProxyMetrics](#proxymetrics) + + + #### ProxyTelemetry diff --git a/site/content/en/latest/design/eg-metrics.md b/site/content/en/latest/design/eg-metrics.md new file mode 100644 index 000000000000..7837e8e69d15 --- /dev/null +++ b/site/content/en/latest/design/eg-metrics.md @@ -0,0 +1,969 @@ +--- +date: 2023-10-10 +title: "Control Plane Observability: Metrics" +author: Xunzhuo Liu +linkTitle: "Control Plane Observability: Metrics" +--- + +This document aims to cover all aspects of envoy gateway control plane metrics observability. + +{{% alert title="Note" color="secondary" %}} +**Data plane** observability (while important) is outside of scope for this document. +{{% /alert %}} + +## Current State + +At present, the Envoy Gateway control plane provides logs and controller-runtime metrics, without traces. Logs are managed through our proprietary library (`internal/logging`, a shim to `zap`) and are written to `/dev/stdout`. + +The absence of comprehensive and robust control plane metrics observability hinders the effective monitoring of Envoy Gateway in a production environment, a critical requirement before deploying Envoy Gateway into production. + +## Goals + +Our objectives include: + ++ Supporting **PULL** mode for Prometheus metrics and exposing these metrics on the admin address. ++ Supporting **PUSH** mode for Prometheus metrics, thereby sending metrics to the Open Telemetry Stats sink. ++ Offering a **COMMON** metrics library so developers can effortlessly add new metrics/labels to each Envoy Gateway component. 
++ Providing **BASIC** metrics produced by each Envoy Gateway component, including: + + Provider + + Resource Translator + + Infra Manager + + xDS Translator + + Extension Manager + +## Non-Goals + +Our non-goals include: + ++ Supporting other stats sinks. + +## Use-Cases + +The use-cases include: + ++ Exposing Prometheus metrics in the Envoy Gateway Control Plane. ++ Pushing Envoy Gateway Control Plane metrics via the Open Telemetry Sink. + +## Design + +### Standards + +Our metrics, and traces in the future, will be built upon the [OpenTelemetry](https://opentelemetry.io/) standards. All metrics will be configured via the [OpenTelemetry SDK](https://opentelemetry.io/docs/specs/otel/metrics/sdk/), which offers neutral libraries that can be connected to various backends. + +This approach allows the Envoy Gateway code to concentrate on the crucial aspect - generating the metrics - and delegate all other tasks to systems designed for telemetry ingestion. + +### Attributes + +OpenTelemetry defines a set of [Semantic Conventions](https://opentelemetry.io/docs/concepts/semantic-conventions/), including [Kubernetes specific ones](https://opentelemetry.io/docs/specs/otel/resource/semantic_conventions/k8s/). + +These attributes can be expressed in logs (as keys of structured logs), traces (as attributes), and metrics (as labels). + +We aim to use attributes consistently where applicable. Where possible, these should adhere to codified Semantic Conventions; when not possible, they should maintain consistency across the project. + +### Extensibility + +Envoy Gateway supports both **PULL/PUSH** mode metrics, with Metrics exported via Prometheus by default. + +Additionally, Envoy Gateway can export metrics using both the [OTEL gRPC metrics exporter](https://opentelemetry.io/docs/specs/otel/metrics/sdk_exporters/otlp/#general) and [OTEL HTTP metrics exporter](https://opentelemetry.io/docs/specs/otel/metrics/sdk_exporters/otlp/#general), which pushes metrics by grpc/http to a remote OTEL collector. + +Users can extend these in two ways: + +#### Downstream Collection + +Based on the exported data, other tools can collect, process, and export telemetry as needed. Some examples include: + ++ Metrics in **PULL** mode: The OTEL collector can scrape Prometheus and export to X. ++ Metrics in **PUSH** mode: The OTEL collector can receive OTEL gRPC/HTTP exporter metrics and export to X. + +While the examples above involve OTEL collectors, there are numerous other systems available. + +#### Vendor extensions + +The OTEL libraries allow for the registration of Providers/Handlers. While we will offer the default ones (PULL via Prometheus, PUSH via OTEL HTTP metrics exporter) mentioned in Envoy Gateway's extensibility, we can easily allow custom builds of Envoy Gateway to plug in alternatives if the default options don't meet their needs. + +For instance, users may prefer to write metrics over the OTLP gRPC metrics exporter instead of the HTTP metrics exporter. This is perfectly acceptable -- and almost impossible to prevent. The OTEL has ways to register their providers/exporters, and Envoy Gateway can ensure its usage is such that it's not overly difficult to swap out a different provider/exporter. + +### Stability + +Observability is, in essence, a user-facing API. Its primary purpose is to be consumed - by both humans and tooling. Therefore, having well-defined guarantees around their formats is crucial. 
+ +Please note that this refers only to the contents of the telemetry - what we emit, the names of things, semantics, etc. Other settings like Prometheus vs OTLP, JSON vs plaintext, logging levels, etc., are not considered. + +I propose the following: + +#### Metrics + +Metrics offer the greatest potential for providing guarantees. They often directly influence alerts and dashboards, making changes highly impactful. This contrasts with traces and logs, which are often used for ad-hoc analysis, where minor changes to information can be easily understood by a human. + +Moreover, there is precedent for this: [Kubernetes Metrics Lifecycle](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#metric-lifecycle) has well-defined processes, and Envoy Gateway's dataplane (Envoy Proxy) metrics are de facto stable. + +Currently, all Envoy Gateway metrics lack defined stability. I suggest we categorize all existing metrics as either: + ++ ***Deprecated***: a metric that is intended to be phased out. ++ ***Experimental***: a metric that is off by default. ++ ***Alpha***: a metric that is on by default. + +We should aim to promote a core set of metrics to **Stable** within a few releases. + +## Library + +Envoy Gateway should offer a metrics library abstraction wrapper, effectively hiding OTEL from the rest of the codebase. + +### Deep Dive + +Although OpenTelemetry has a fairly user-friendly API, I believe we still benefit from a wrapper, which provides the following advantages: + ++ Workarounds for various library limitations (histograms, gauges, counters). ++ Codification of best practices; while we don't do much today, this may become more important as we start to define [Stability](#stability). ++ Provision of optimizations that would otherwise be tedious. + +For now, I believe having a wrapper benefits us. However, it seems plausible that the need for this wrapper could diminish over time. + +#### Metric Abstraction + +##### Metric Interface + +```go +// A Metric collects numerical observations. +type Metric interface { + // Name returns the name value of a Metric. + Name() string + + // Record makes an observation of the provided value for the given measure. + Record(value float64) + + // RecordInt makes an observation of the provided value for the measure. + RecordInt(value int64) + + // Increment records a value of 1 for the current measure. + // For Counters, this is equivalent to adding 1 to the current value. + // For Gauges, this is equivalent to setting the value to 1. + // For Histograms, this is equivalent to making an observation of value 1. + Increment() + + // Decrement records a value of -1 for the current measure. + // For Counters, this is equivalent to subtracting -1 to the current value. + // For Gauges, this is equivalent to setting the value to -1. + // For Histograms, this is equivalent to making an observation of value -1. + Decrement() + + // With creates a new Metric, with the LabelValues provided. + // This allows creating a set of pre-dimensioned data for recording purposes. + // This is primarily used for documentation and convenience. + // Metrics created with this method do not need to be registered (they share the registration of their parent Metric). + With(labelValues ...LabelValue) Metric +} +``` + +##### Metric Label + +```go +// NewLabel will attempt to create a new Label. +func NewLabel(key string) Label { + return Label{attribute.Key(key)} +} + +// A Label provides a named dimension for a Metric. 
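+// Labels are typically declared once per package and bound to concrete values
+// at record time, e.g.
+// watchableHandleUpdates.With(componentNameLabel.Value("provider")).Increment().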
+type Label struct { + key attribute.Key +} + +// Value creates a new LabelValue for the Label. +func (l Label) Value(value string) LabelValue { + return LabelValue{l.key.String(value)} +} + +// A LabelValue represents a Label with a specific value. It is used to record +// values for a Metric. +type LabelValue struct { + keyValue attribute.KeyValue +} + +``` + +##### Metric Definition + +```go +// Definition records a metric's metadata. +type Definition struct { + Name string + Type MetricType + Description string + Bounds []float64 +} + +// metrics stores known metrics +type def struct { + started bool + mu sync.Mutex + known map[string]Definition +} + +// defs is a global that stores all registered metrics +var defs = def{ + known: map[string]Definition{}, +} +``` + +#### Metric Implementation + +##### Base + +```go +// base metric implementation. +type base struct { + name string + attrs []attribute.KeyValue + m Metric +} + +func (f base) Name() string { + return f.name +} + +func (f base) Increment() { + f.m.Record(1) +} + +func (f base) Decrement() { + f.m.Record(-1) +} + +func (f base) RecordInt(value int64) { + f.m.Record(float64(value)) +} +``` + +#### Disabled + +```go +// disabled metric implementation. +type disabled struct { + name string +} + +// Decrement implements Metric +func (dm *disabled) Decrement() {} + +// Increment implements Metric +func (dm *disabled) Increment() {} + +// Name implements Metric +func (dm *disabled) Name() string { + return dm.name +} + +// Record implements Metric +func (dm *disabled) Record(value float64) {} + +// RecordInt implements Metric +func (dm *disabled) RecordInt(value int64) {} + +// With implements Metric +func (dm *disabled) With(labelValues ...LabelValue) Metric { + return dm +} +``` + +##### Counter + +```go +type counter struct { + base + c api.Float64Counter + // preRecordOptions is just a precomputation to avoid allocations on each record call + preRecordOptions []api.AddOption +} + +func (f *counter) Record(value float64) { + if f.preRecordOptions != nil { + f.c.Add(context.Background(), value, f.preRecordOptions...) + } else { + f.c.Add(context.Background(), value) + } +} + +func (f *counter) With(labelValues ...LabelValue) Metric { + attrs, set := mergeAttributes(f.base, labelValues) + m := &counter{ + c: f.c, + preRecordOptions: []api.AddOption{api.WithAttributeSet(set)}, + } + m.base = base{ + name: f.name, + attrs: attrs, + m: m, + } + return m +} + +``` + +##### Gauge + +```go +type gauge struct { + base + g api.Float64ObservableGauge + + mutex *sync.RWMutex + + // labelSets stores a map of attributes -> values, for gauges. 
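+ // Float64ObservableGauge is asynchronous: values are not pushed on Record,
+ // so the wrapper keeps the last recorded value per label set here and
+ // reports it when metrics are collected.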
+ labelSets map[attribute.Set]*gaugeValues + current *gaugeValues +} + +type gaugeValues struct { + val float64 + opt []api.ObserveOption +} + +func (f *gauge) Record(value float64) { + f.mutex.Lock() + if f.current == nil { + f.current = &gaugeValues{} + f.labelSets[attribute.NewSet()] = f.current + } + f.current.val = value + f.mutex.Unlock() +} + +func (f *gauge) With(labelValues ...LabelValue) Metric { + attrs, set := mergeAttributes(f.base, labelValues) + m := &gauge{ + g: f.g, + mutex: f.mutex, + labelSets: f.labelSets, + } + if _, f := m.labelSets[set]; !f { + m.labelSets[set] = &gaugeValues{ + opt: []api.ObserveOption{api.WithAttributeSet(set)}, + } + } + m.current = m.labelSets[set] + m.base = base{ + name: f.name, + attrs: attrs, + m: m, + } + return m +} +``` + +##### Histogram + +```go +type histogram struct { + base + d api.Float64Histogram + // preRecordOptions is just a precomputation to avoid allocations on each record call + preRecordOptions []api.RecordOption +} + +func (f *histogram) Record(value float64) { + if f.preRecordOptions != nil { + f.d.Record(context.Background(), value, f.preRecordOptions...) + } else { + f.d.Record(context.Background(), value) + } +} + +func (f *histogram) With(labelValues ...LabelValue) Metric { + attrs, set := mergeAttributes(f.base, labelValues) + m := &histogram{ + d: f.d, + preRecordOptions: []api.RecordOption{api.WithAttributeSet(set)}, + } + m.base = base{ + name: f.name, + attrs: attrs, + m: m, + } + return m +} +``` + +#### Extensibility + +##### Register + +```go +// register sets the global metrics registry to the provided Prometheus registerer. +func register(opts metricsOptions) error { + otelOpts := []metric.Option{} + + if err := registerOTELPromExporter(&otelOpts, opts); err != nil { + return err + } + if err := registerOTELHTTPexporter(&otelOpts, opts); err != nil { + return err + } + if err := registerOTELgRPCexporter(&otelOpts, opts); err != nil { + return err + } + otelOpts = append(otelOpts, defs.preAddOptions()...) + + mp := metric.NewMeterProvider(otelOpts...) + otel.SetMeterProvider(mp) + + return nil +} +``` + +##### Exporter + +```go +// registerOTELPromExporter registers OTEL prometheus exporter (PULL mode). +func registerOTELPromExporter(otelOpts *[]metric.Option, opts metricsOptions) error { + if opts.pullOptions.enable { + promOpts := []otelprom.Option{ + otelprom.WithoutScopeInfo(), + otelprom.WithoutTargetInfo(), + otelprom.WithoutUnits(), + otelprom.WithRegisterer(opts.pullOptions.reg), + otelprom.WithoutCounterSuffixes(), + } + promreader, err := otelprom.New(promOpts...) + if err != nil { + return err + } + + *otelOpts = append(*otelOpts, metric.WithReader(promreader)) + monitoringLogger.Info("initialized metrics pull endpoint", "address", opts.pullOptions.address) + } + + return nil +} + +// registerOTELHTTPexporter registers OTEL HTTP metrics exporter (PUSH mode). 
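+// One OTLP/HTTP exporter and periodic reader is registered per configured
+// sink, pushing metrics to the sink's host:port endpoint.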
+func registerOTELHTTPexporter(otelOpts *[]metric.Option, opts metricsOptions) error { + for _, sink := range opts.pushOptions.sinks { + address := fmt.Sprintf("%s:%d", sink.host, sink.port) + httpexporter, err := otlpmetrichttp.New( + context.Background(), + otlpmetrichttp.WithEndpoint(address), + otlpmetrichttp.WithInsecure(), + ) + if err != nil { + return err + } + + otelreader := metric.NewPeriodicReader(httpexporter) + *otelOpts = append(*otelOpts, metric.WithReader(otelreader)) + monitoringLogger.Info("initialized metrics push endpoint", "address", address) + } + + return nil +} + +// registerOTELgRPCexporter registers OTEL gRPC metrics exporter (PUSH mode). +func registerOTELgRPCexporter(otelOpts *[]metric.Option, opts metricsOptions) error { + for _, sink := range opts.pushOptions.sinks { + if sink.protocol == "grpc" { + address := fmt.Sprintf("%s:%d", sink.host, sink.port) + httpexporter, err := otlpmetricgrpc.New( + context.Background(), + otlpmetricgrpc.WithEndpoint(address), + otlpmetricgrpc.WithInsecure(), + ) + if err != nil { + return err + } + + otelreader := metric.NewPeriodicReader(httpexporter) + *otelOpts = append(*otelOpts, metric.WithReader(otelreader)) + monitoringLogger.Info("initialized otel grpc metrics push endpoint", "address", address) + } + } + + return nil +} +``` + +#### How to Use? + +> Let me take metrics instrumentation in watchable message queue as an example + +1. create metrics in target pkg: + +```go + // metrics definitions + watchableHandleUpdates = metrics.NewCounter( + "watchable_queue_handle_updates_total", + "Total number of updates handled by watchable queue.", + ) + + watchableHandleUpdateErrors = metrics.NewCounter( + "watchable_queue_handle_updates_errors_total", + "Total number of update errors handled by watchable queue.", + ) + + watchableDepth = metrics.NewGauge( + "watchable_queue_depth", + "Current depth of watchable message queue.", + ) + + watchableHandleUpdateTimeSeconds = metrics.NewHistogram( + "watchable_queue_handle_update_time_seconds", + "How long in seconds a update handled by watchable queue.", + []float64{0.001, 0.01, 0.1, 1, 5, 10}, + ) + +``` + +2. create metric labels if needed in target pkg: + +```go + // metrics label definitions + // component is which component the update belong to. + componentNameLabel = metrics.NewLabel("component_name") + // resource is which resource the update belong to. + resourceTypeLabel = metrics.NewLabel("resource_type") +``` + +3. record the metric value with label in target pkg: + +```go +type Update[K comparable, V any] watchable.Update[K, V] + +var logger = logging.DefaultLogger(v1alpha1.LogLevelInfo).WithName("watchable") + +type UpdateMetadata struct { + Component string + Resource string +} + +func (m UpdateMetadata) LabelValues() []metrics.LabelValue { + labels := []metrics.LabelValue{} + if m.Component != "" { + labels = append(labels, componentNameLabel.Value(m.Component)) + } + if m.Resource != "" { + labels = append(labels, resourceTypeLabel.Value(m.Resource)) + } + + return labels +} + +// HandleSubscription takes a channel returned by +// watchable.Map.Subscribe() (or .SubscribeSubset()), and calls the +// given function for each initial value in the map, and for any +// updates. +// +// This is better than simply iterating over snapshot.Updates because +// it handles the case where the watchable.Map already contains +// entries before .Subscribe is called. 
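+//
+// Each handled update increments watchable_queue_handle_updates_total and
+// records its handling latency in watchable_queue_handle_update_time_seconds;
+// errors sent on errChans are logged and counted in
+// watchable_queue_handle_updates_errors_total.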
+func HandleSubscription[K comparable, V any]( + meta UpdateMetadata, + subscription <-chan watchable.Snapshot[K, V], + handle func(updateFunc Update[K, V], errChans chan error), +) { + errChans := make(chan error, 10) + go func() { + for err := range errChans { + logger.WithValues("component", meta.Component).Error(err, "observed an error") + watchableHandleUpdateErrors.With(meta.LabelValues()...).Increment() + } + }() + + if snapshot, ok := <-subscription; ok { + for k, v := range snapshot.State { + startHandleTime := time.Now() + handle(Update[K, V]{Key: k, Value: v}, errChans) + watchableHandleUpdates.With(meta.LabelValues()...).Increment() + watchableHandleUpdateTimeSeconds.With(meta.LabelValues()...).Record(time.Since(startHandleTime).Seconds()) + } + } + for snapshot := range subscription { + watchableDepth.With(meta.LabelValues()...).RecordInt(int64(len(subscription))) + for _, update := range snapshot.Updates { + startHandleTime := time.Now() + handle(Update[K, V](update), errChans) + watchableHandleUpdates.With(meta.LabelValues()...).Increment() + watchableHandleUpdateTimeSeconds.With(meta.LabelValues()...).Record(time.Since(startHandleTime).Seconds()) + } + } +} + +``` + +4. Build and Test + +Visit the `{adminPort}:/metrics` and you can see the new added metrics: + +![metrics](/img/metrics-demo-1.png) + +## Basic Metrics Instrumentation + +### Provider + +#### Label + +##### Label Name: component_name + +Scope: + +1. watchable_queue_handle_updates_total +2. watchable_queue_handle_updates_errors_total +3. watchable_queue_depth +4. watchable_queue_handle_update_time_seconds + +Supported values: + ++ provider + +##### Label Name: resource_type + +Scope: + +1. watchable_queue_handle_updates_total +2. watchable_queue_handle_updates_errors_total +3. watchable_queue_depth +4. watchable_queue_handle_update_time_seconds + +Supported values: + ++ httproute-status ++ tcproute-status ++ udproute-status ++ tlsroute-status ++ envoypatchpolicy-status ++ ... + +#### Counter + ++ watchable_queue_handle_updates_total: Total number of updates handled by watchable queue. ++ watchable_queue_handle_updates_errors_total: Total number of update errors handled by watchable queue. + +--- + +> controller-runtime metrics + ++ certwatcher_read_certificate_total: Total number of certificate reads ++ certwatcher_read_certificate_errors_total: Total number of certificate read errors ++ controller_runtime_reconcile_errors_total: Total number of reconciliation errors per controller ++ controller_runtime_reconcile_total: Total number of reconciliations per controller ++ rest_client_requests_total: Number of HTTP requests, partitioned by status code, method, and host. ++ workqueue_adds_total: Total number of adds handled by workqueue ++ workqueue_retries_total: Total number of retries handled by workqueue + +#### Gauge + ++ watchable_queue_depth: Current depth of watchable message queue. + +--- + +> controller-runtime metrics + ++ controller_runtime_active_workers: Number of currently used workers per controller ++ controller_runtime_max_concurrent_reconciles: Maximum number of concurrent reconciles per controller ++ workqueue_depth: Current depth of workqueue + +#### Histogram + ++ watchable_queue_handle_update_time_seconds: How long in seconds a update handled by watchable queue. 
+ +--- + +> controller-runtime metrics + ++ controller_runtime_reconcile_time_seconds: Length of time per reconciliation per controller ++ workqueue_longest_running_processor_seconds: How many seconds has the longest running processor for workqueue been running. ++ workqueue_queue_duration_seconds: How long in seconds an item stays in workqueue before being requested ++ workqueue_unfinished_work_seconds: How many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases. ++ workqueue_work_duration_seconds: How long in seconds processing an item from workqueue takes. + +### Resource Translator + +#### Label + +##### Label Name: component_name + +Scope: + +1. watchable_queue_handle_updates_total +2. watchable_queue_handle_updates_errors_total +3. watchable_queue_depth +4. watchable_queue_handle_update_time_seconds + +Supported values: + ++ gateway-api + +#### Counter + ++ watchable_queue_handle_updates_total: Total number of updates handled by watchable queue. ++ watchable_queue_handle_updates_errors_total: Total number of update errors handled by watchable queue. + +#### Gauge + ++ watchable_queue_depth: Current depth of watchable message queue. + +#### Histogram + ++ watchable_queue_handle_update_time_seconds: How long in seconds a update handled by watchable queue. + +### Infra Manager + +#### Label + +##### Label Name: component_name + +Scope: + +1. watchable_queue_handle_updates_total +2. watchable_queue_handle_updates_errors_total +3. watchable_queue_depth +4. watchable_queue_handle_update_time_seconds + +Supported values: + ++ infrastructure + +##### Label Name: operation + +Scope: + +1. infra_manager_resources_errors_total + +Supported values: + ++ created ++ updated ++ deleted + +##### Label Name: k8s_resource_type + +Scope: + +1. infra_manager_resources_created_total +2. infra_manager_resources_updated_total +3. infra_manager_resources_deleted_total +4. infra_manager_resources_errors_total + +Supported values: + ++ Deployment ++ Service ++ ServiceAccount ++ ConfigMap + +##### Label Name: k8s_resource_name + +Scope: + +1. infra_manager_resources_created_total +2. infra_manager_resources_updated_total +3. infra_manager_resources_deleted_total +4. infra_manager_resources_errors_total + +Supported values: + +resource name + +##### Label Name: k8s_resource_namespace + +Scope: + +1. infra_manager_resources_created_total +2. infra_manager_resources_updated_total +3. infra_manager_resources_deleted_total +4. infra_manager_resources_errors_total + +Supported values: + +resource namespace + +#### Counter + ++ watchable_queue_handle_updates_total: Total number of updates handled by watchable queue. ++ watchable_queue_handle_updates_errors_total: Total number of update errors handled by watchable queue. + ++ infra_manager_resources_created_total: Total number of the resources created by infra manager. ++ infra_manager_resources_updated_total: Total number of the resources updated by infra manager. ++ infra_manager_resources_deleted_total: Total number of the resources deleted by infra manager. + ++ infra_manager_resources_errors_total: Total number of the resources errors encountered by infra manager. + +#### Gauge + ++ watchable_queue_depth: Current depth of watchable message queue. + +#### Histogram + ++ watchable_queue_handle_update_time_seconds: How long in seconds a update handled by watchable queue. 
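+
+To make the mapping concrete, here is a sketch of how the infra manager could declare and record the counters and labels listed above using the metrics library (the package, variable and helper names are illustrative, not the actual implementation):
+
+```go
+package kubernetes
+
+import "github.com/envoyproxy/gateway/internal/metrics"
+
+var (
+	// metrics definitions
+	infraResourcesCreated = metrics.NewCounter(
+		"infra_manager_resources_created_total",
+		"Total number of the resources created by infra manager.",
+	)
+
+	// metrics label definitions
+	kubeResourceTypeLabel      = metrics.NewLabel("k8s_resource_type")
+	kubeResourceNameLabel      = metrics.NewLabel("k8s_resource_name")
+	kubeResourceNamespaceLabel = metrics.NewLabel("k8s_resource_namespace")
+)
+
+// recordCreated increments the created counter for a single reconciled resource.
+func recordCreated(kind, name, namespace string) {
+	infraResourcesCreated.With(
+		kubeResourceTypeLabel.Value(kind),
+		kubeResourceNameLabel.Value(name),
+		kubeResourceNamespaceLabel.Value(namespace),
+	).Increment()
+}
+```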
+ +### xDS Translator + +#### Label + +##### Label Name: component_name + +Scope: + +1. watchable_queue_handle_updates_total +2. watchable_queue_handle_updates_errors_total +3. watchable_queue_depth +4. watchable_queue_handle_update_time_seconds + +Supported values: + ++ xds-translator + +#### Counter + ++ watchable_queue_handle_updates_total: Total number of updates handled by watchable queue. ++ watchable_queue_handle_updates_errors_total: Total number of update errors handled by watchable queue. + +#### Gauge + ++ watchable_queue_depth: Current depth of watchable message queue. + +#### Histogram + ++ watchable_queue_handle_update_time_seconds: How long in seconds a update handled by watchable queue. + +### Extension Manager + +#### Label + +##### Label Name: target + +Scope: + +1. extension_manager_post_hook_calls_total +2. extension_manager_post_hook_call_errors_total +3. extension_manager_post_hook_time_seconds + +Supported values: + ++ route ++ virtualHost ++ listener ++ cluster + +#### Counter + ++ extension_manager_post_hook_calls_total: Total number of the post hook calls in extension manager. ++ extension_manager_post_hook_call_errors_total: Total number of the post hook call errors in extension manager. + +#### Histogram + ++ extension_manager_post_hook_time_seconds: How long in seconds a post hook called in extension manager. + +## Envoy Gateway API Types + +New APIs will be added to Envoy Gateway config, which are used to manage Control Plane Telemetry bootstrap configs. + +### EnvoyGatewayTelemetry + +```go +// EnvoyGatewayTelemetry defines telemetry configurations for envoy gateway control plane. +// Control plane will focus on metrics observability telemetry and tracing telemetry later. +type EnvoyGatewayTelemetry struct { + // Metrics defines metrics configuration for envoy gateway. + Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"` +} +``` + +### EnvoyGatewayMetrics + +```go +// EnvoyGatewayMetrics defines control plane push/pull metrics configurations. +type EnvoyGatewayMetrics struct { + // Sinks defines the metric sinks where metrics are sent to. + Sinks []EnvoyGatewayMetricSink `json:"sinks,omitempty"` + // Prometheus defines the configuration for prometheus endpoint. + Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"` +} + +// EnvoyGatewayMetricSink defines control plane +// metric sinks where metrics are sent to. +type EnvoyGatewayMetricSink struct { + // Type defines the metric sink type. + // EG control plane currently supports OpenTelemetry. + // +kubebuilder:validation:Enum=OpenTelemetry + // +kubebuilder:default=OpenTelemetry + Type MetricSinkType `json:"type"` + // Host define the sink service hostname. + Host string `json:"host"` + // Port defines the port the sink service is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=4318 + Port int32 `json:"port,omitempty"` +} + +// EnvoyGatewayPrometheusProvider will expose prometheus endpoint +// `/stats/prometheus` and reuse Envoy Gateway admin port. +type EnvoyGatewayPrometheusProvider struct { + // Enable defines if enables the prometheus metrics in pull mode. Default is true. + // + // +optional + // +kubebuilder:default=true + Enable bool `json:"enable,omitempty"` +} +``` + +#### Example + ++ The following is an example to enable prometheus metric. 
+
+```yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyGateway
+gateway:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller
+logging:
+  level:
+    default: info
+provider:
+  type: Kubernetes
+telemetry:
+  metrics:
+    prometheus:
+      enable: true
+```
+
++ The following is an example to send metrics via an OpenTelemetry sink.
+
+```yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyGateway
+gateway:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller
+logging:
+  level:
+    default: info
+provider:
+  type: Kubernetes
+telemetry:
+  metrics:
+    sinks:
+      - type: OpenTelemetry
+        host: otel-collector.monitoring.svc.cluster.local
+        port: 4318
+```
+
++ The following is an example to enable the Prometheus endpoint and send metrics via an OpenTelemetry sink at the same time.
+
+```yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyGateway
+gateway:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller
+logging:
+  level:
+    default: info
+provider:
+  type: Kubernetes
+telemetry:
+  metrics:
+    prometheus:
+      enable: true
+    sinks:
+      - type: OpenTelemetry
+        host: otel-collector.monitoring.svc.cluster.local
+        port: 4318
+```
diff --git a/site/content/en/latest/design/pprof.md b/site/content/en/latest/design/pprof.md
index c535b4800710..b8777c33d114 100644
--- a/site/content/en/latest/design/pprof.md
+++ b/site/content/en/latest/design/pprof.md
@@ -1,15 +1,19 @@
 ---
-title: "Add Pprof support in Envoy Gateway"
+title: "Debug support in Envoy Gateway"
 ---
 
 ## Overview
 
-Envoy Gateway exposes endpoints at `localhost:8899/debug/pprof` to run Golang profiles to aid in live debugging. The endpoints are equivalent to those found in the http/pprof package. `/debug/pprof/` returns an HTML page listing the available profiles.
+Envoy Gateway exposes endpoints at `localhost:19010/debug/pprof` to run Golang profiles to aid in live debugging.
+
+The endpoints are equivalent to those found in the http/pprof package. `/debug/pprof/` returns an HTML page listing the available profiles.
 
 ## Goals
 
-* Add Debug Pprof support to Envoy Gateway control plane.
+* Add a debug server to the Envoy Gateway control plane, separate from the admin server.
+* Add pprof support to the Envoy Gateway control plane.
 * Define an API to allow Envoy Gateway to custom debug server configuration.
+* Define an API to allow Envoy Gateway to enable Envoy Gateway config dump in logs.
 
 The following are the different types of profiles end-user can run:
 
@@ -30,11 +34,12 @@ PROFILE | FUNCTION
 
 ## API
 
-* Add `admin` field in EnvoyGateway config.
-* Add `debug` field under `admin` field.
-* Add `enable`, `port` and `host` under `address` field.
+* Add `debug` field in EnvoyGateway config.
+* Add `address` field under `debug` field.
+* Add `port` and `host` under `address` field.
+* Add `dumpConfig` field under `debug` field.
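+
+A rough sketch of the corresponding Go API types follows (field names match the API reference; the exact struct definitions, JSON tags and defaults are an approximation here and live in `api/v1alpha1`):
+
+```go
+// EnvoyGatewayDebug defines the Envoy Gateway Debug configuration.
+type EnvoyGatewayDebug struct {
+	// DumpConfig defines whether to dump the Envoy Gateway config in logs.
+	DumpConfig bool `json:"dumpConfig,omitempty"`
+	// Address defines the address of the Envoy Gateway Debug Server.
+	// Pprof is served on this address when it is set.
+	Address *EnvoyGatewayDebugAddress `json:"address,omitempty"`
+}
+
+// EnvoyGatewayDebugAddress defines the Envoy Gateway Debug Address configuration.
+type EnvoyGatewayDebugAddress struct {
+	// Port defines the port the debug server is exposed on.
+	Port int `json:"port,omitempty"`
+	// Host defines the debug server hostname.
+	Host string `json:"host,omitempty"`
+}
+```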
-Here is an example configuration
+Here is an example configuration to enable the debug server:
 
 ``` yaml
 apiVersion: gateway.envoyproxy.io/v1alpha1
@@ -43,9 +48,21 @@ gateway:
 kind: EnvoyGateway
 provider:
   type: "Kubernetes"
-admin:
-  debug: true
+debug:
   address:
-    port: 8899
-    host: "127.0.0.1"
+    host: 127.0.0.1
+    port: 19010
+```
+
+Here is an example configuration to enable Envoy Gateway config dump in logs:
+
+```yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+gateway:
+  controllerName: "gateway.envoyproxy.io/gatewayclass-controller"
+kind: EnvoyGateway
+provider:
+  type: "Kubernetes"
+debug:
+  dumpConfig: true
+```
diff --git a/site/static/img/metrics-demo-1.png b/site/static/img/metrics-demo-1.png
new file mode 100644
index 000000000000..ff2c5fed331a
Binary files /dev/null and b/site/static/img/metrics-demo-1.png differ