diff --git a/docs/monitoring.md b/docs/monitoring.md index 261e7b06a5..1a47141a7c 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -91,7 +91,7 @@ NGINX Kubernetes Gateway exports the following metrics: - nginx_reload_errors_total. Number of unsuccessful NGINX reloads. - nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means NGINX is running with a stale version. - - nginx_last_reload_milliseconds. Duration in milliseconds of NGINX reloads. + - nginx_last_reload_milliseconds. Duration in milliseconds of NGINX reloads (histogram). - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`. diff --git a/internal/mode/static/manager.go b/internal/mode/static/manager.go index 3876a40823..e84bc1404f 100644 --- a/internal/mode/static/manager.go +++ b/internal/mode/static/manager.go @@ -130,13 +130,9 @@ func StartManager(cfg config.Config) error { return fmt.Errorf("NGINX is not running: %w", err) } - var mgrCollector ngxruntime.ManagerCollector - mgrCollector = nkgmetrics.NewManagerFakeCollector() - if cfg.MetricsConfig.Enabled { - mgrCollector, err = configureNginxMetrics(cfg.GatewayClassName) - if err != nil { - return err - } + mgrCollector, err := createAndRegisterMetricsCollectors(cfg.MetricsConfig.Enabled, cfg.GatewayClassName) + if err != nil { + return fmt.Errorf("cannot create and register metrics collectors: %w", err) } statusUpdater := status.NewUpdater(status.UpdaterConfig{ @@ -356,19 +352,26 @@ func setInitialConfig( return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter) } -func configureNginxMetrics(gatewayClassName string) (*nkgmetrics.ManagerMetricsCollector, error) { - constLabels := map[string]string{"class": gatewayClassName} +// createAndRegisterMetricsCollectors creates the NGINX status and NGINX runtime manager collectors, registers them, +// and returns the runtime manager collector to be used in the nginxRuntimeMgr. +func createAndRegisterMetricsCollectors(metricsEnabled bool, gwClassName string) (ngxruntime.ManagerCollector, error) { + if !metricsEnabled { + // return a no-op collector to avoid nil pointer errors when metrics are disabled + return nkgmetrics.NewManagerNoopCollector(), nil + } + constLabels := map[string]string{"class": gwClassName} + ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels) if err != nil { - return nil, fmt.Errorf("cannot get NGINX metrics: %w", err) + return nil, fmt.Errorf("cannot create NGINX status metrics collector: %w", err) } - mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels) - if err = metrics.Registry.Register(mgrCollector); err != nil { - return nil, fmt.Errorf("failed to register manager metrics collector: %w", err) + if err := metrics.Registry.Register(ngxCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX status metrics collector: %w", err) } - if err = metrics.Registry.Register(ngxCollector); err != nil { - return nil, fmt.Errorf("failed to register NGINX metrics collector: %w", err) + mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels) + if err := metrics.Registry.Register(mgrCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX manager runtime metrics collector: %w", err) } return mgrCollector, nil diff --git a/internal/mode/static/metrics/collector.go b/internal/mode/static/metrics/collector.go index 6ecd171d3b..daa454ed85 100644 --- a/internal/mode/static/metrics/collector.go +++ b/internal/mode/static/metrics/collector.go @@ -47,7 +47,7 @@ func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCo Namespace: metricsNamespace, Help: "Duration in milliseconds of NGINX reloads", ConstLabels: constLabels, - Buckets: []float64{100.0, 200.0, 300.0, 400.0, 500.0}, + Buckets: []float64{500, 1000, 5000, 10000, 30000}, }, ), } @@ -55,13 +55,13 @@ func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCo } // IncNginxReloadCount increments the counter of successful NGINX reloads and sets the stale config status to false. -func (mc *ManagerMetricsCollector) IncNginxReloadCount() { +func (mc *ManagerMetricsCollector) IncReloadCount() { mc.reloadsTotal.Inc() mc.updateConfigStaleStatus(false) } // IncNginxReloadErrors increments the counter of NGINX reload errors and sets the stale config status to true. -func (mc *ManagerMetricsCollector) IncNginxReloadErrors() { +func (mc *ManagerMetricsCollector) IncReloadErrors() { mc.reloadsError.Inc() mc.updateConfigStaleStatus(true) } @@ -75,8 +75,8 @@ func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) { mc.configStale.Set(status) } -// UpdateLastReloadTime updates the last NGINX reload time. -func (mc *ManagerMetricsCollector) UpdateLastReloadTime(duration time.Duration) { +// ObserveLastReloadTime adds the last NGINX reload time to the histogram. +func (mc *ManagerMetricsCollector) ObserveLastReloadTime(duration time.Duration) { mc.reloadsDuration.Observe(float64(duration / time.Millisecond)) } @@ -96,20 +96,20 @@ func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) { mc.reloadsDuration.Collect(ch) } -// ManagerFakeCollector is a fake collector that will implement ManagerCollector interface. +// ManagerNoopCollector is a no-op collector that will implement ManagerCollector interface. // Used to initialize the ManagerCollector when metrics are disabled to avoid nil pointer errors. -type ManagerFakeCollector struct{} +type ManagerNoopCollector struct{} -// NewManagerFakeCollector creates a fake collector that implements ManagerCollector interface. -func NewManagerFakeCollector() *ManagerFakeCollector { - return &ManagerFakeCollector{} +// NewManagerNoopCollector creates a no-op collector that implements ManagerCollector interface. +func NewManagerNoopCollector() *ManagerNoopCollector { + return &ManagerNoopCollector{} } -// IncNginxReloadCount implements a fake IncNginxReloadCount. -func (mc *ManagerFakeCollector) IncNginxReloadCount() {} +// IncReloadCount implements a no-op IncReloadCount. +func (mc *ManagerNoopCollector) IncReloadCount() {} -// IncNginxReloadErrors implements a fake IncNginxReloadErrors. -func (mc *ManagerFakeCollector) IncNginxReloadErrors() {} +// IncReloadErrors implements a no-op IncReloadErrors. +func (mc *ManagerNoopCollector) IncReloadErrors() {} -// UpdateLastReloadTime implements a fake UpdateLastReloadTime. -func (mc *ManagerFakeCollector) UpdateLastReloadTime(_ time.Duration) {} +// ObserveLastReloadTime implements a no-op ObserveLastReloadTime. +func (mc *ManagerNoopCollector) ObserveLastReloadTime(_ time.Duration) {} diff --git a/internal/mode/static/nginx/runtime/manager.go b/internal/mode/static/nginx/runtime/manager.go index 377c9e3f86..4e64fa56f1 100644 --- a/internal/mode/static/nginx/runtime/manager.go +++ b/internal/mode/static/nginx/runtime/manager.go @@ -43,27 +43,27 @@ type Manager interface { // ManagerCollector is an interface for the metrics of the NGINX runtime manager. type ManagerCollector interface { - IncNginxReloadCount() - IncNginxReloadErrors() - UpdateLastReloadTime(ms time.Duration) + IncReloadCount() + IncReloadErrors() + ObserveLastReloadTime(ms time.Duration) } // ManagerImpl implements Manager. type ManagerImpl struct { verifyClient *verifyClient - metricsCollector ManagerCollector + managerCollector ManagerCollector } // NewManagerImpl creates a new ManagerImpl. -func NewManagerImpl(mgrCollector ManagerCollector) *ManagerImpl { +func NewManagerImpl(managerCollector ManagerCollector) *ManagerImpl { return &ManagerImpl{ verifyClient: newVerifyClient(nginxReloadTimeout), - metricsCollector: mgrCollector, + managerCollector: managerCollector, } } func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { - t1 := time.Now() + start := time.Now() // We find the main NGINX PID on every reload because it will change if the NGINX container is restarted. pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout) if err != nil { @@ -79,7 +79,7 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { // send HUP signal to the NGINX main process reload configuration // See https://nginx.org/en/docs/control.html if err := syscall.Kill(pid, syscall.SIGHUP); err != nil { - m.metricsCollector.IncNginxReloadErrors() + m.managerCollector.IncReloadErrors() return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err) } @@ -90,18 +90,18 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { os.ReadFile, childProcsTimeout, ); err != nil { - m.metricsCollector.IncNginxReloadErrors() + m.managerCollector.IncReloadErrors() return fmt.Errorf(noNewWorkersErrFmt, configVersion, err) } if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil { - m.metricsCollector.IncNginxReloadErrors() + m.managerCollector.IncReloadErrors() return err } - m.metricsCollector.IncNginxReloadCount() + m.managerCollector.IncReloadCount() - t2 := time.Now() - m.metricsCollector.UpdateLastReloadTime(t2.Sub(t1)) + finish := time.Now() + m.managerCollector.ObserveLastReloadTime(finish.Sub(start)) return nil }