Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NGINX reload counters #1049

Merged
merged 3 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ NGINX Kubernetes Gateway exports the following metrics:
- These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the
Gateway class of NKG. For example, `nginx_kubernetes_gateway_connections_accepted{class="nginx"}`.

- NGINX Kubernetes Gateway metrics:
- nginx_reloads_total. Number of successful NGINX reloads.
- nginx_reload_errors_total. Number of unsuccessful NGINX reloads.
- nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means
NGINX is running with a stale version.
- nginx_reloads_milliseconds. Duration in milliseconds of NGINX reloads (histogram).
- These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the
Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`.

- [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) metrics. These include:
- Total number of reconciliation errors per controller
- Length of reconcile queue per controller
Expand Down
47 changes: 31 additions & 16 deletions internal/mode/static/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ func StartManager(cfg config.Config) error {
return fmt.Errorf("cannot clear NGINX configuration folders: %w", err)
}

// Ensure NGINX is running before registering metrics & starting the manager.
if err := ngxruntime.EnsureNginxRunning(ctx); err != nil {
return fmt.Errorf("NGINX is not running: %w", err)
}

mgrCollector, err := createAndRegisterMetricsCollectors(cfg.MetricsConfig.Enabled, cfg.GatewayClassName)
if err != nil {
return fmt.Errorf("cannot create and register metrics collectors: %w", err)
}

statusUpdater := status.NewUpdater(status.UpdaterConfig{
GatewayCtlrName: cfg.GatewayCtlrName,
GatewayClassName: cfg.GatewayClassName,
Expand All @@ -146,7 +156,7 @@ func StartManager(cfg config.Config) error {
cfg.Logger.WithName("nginxFileManager"),
file.NewStdLibOSFileManager(),
),
nginxRuntimeMgr: ngxruntime.NewManagerImpl(),
nginxRuntimeMgr: ngxruntime.NewManagerImpl(mgrCollector),
statusUpdater: statusUpdater,
eventRecorder: recorder,
healthChecker: hc,
Expand Down Expand Up @@ -193,17 +203,6 @@ func StartManager(cfg config.Config) error {
}
}

// Ensure NGINX is running before registering metrics & starting the manager.
if err := ngxruntime.EnsureNginxRunning(ctx); err != nil {
return fmt.Errorf("NGINX is not running: %w", err)
}

if cfg.MetricsConfig.Enabled {
if err := configureNginxMetrics(cfg.GatewayClassName); err != nil {
return err
}
}

cfg.Logger.Info("Starting manager")
return mgr.Start(ctx)
}
Expand Down Expand Up @@ -353,13 +352,29 @@ func setInitialConfig(
return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter)
}

func configureNginxMetrics(gatewayClassName string) error {
constLabels := map[string]string{"class": gatewayClassName}
// createAndRegisterMetricsCollectors creates the NGINX status and NGINX runtime manager collectors, registers them,
// and returns the runtime manager collector to be used in the nginxRuntimeMgr.
func createAndRegisterMetricsCollectors(metricsEnabled bool, gwClassName string) (ngxruntime.ManagerCollector, error) {
if !metricsEnabled {
// return a no-op collector to avoid nil pointer errors when metrics are disabled
return nkgmetrics.NewManagerNoopCollector(), nil
}
constLabels := map[string]string{"class": gwClassName}

ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels)
if err != nil {
return fmt.Errorf("cannot get NGINX metrics: %w", err)
return nil, fmt.Errorf("cannot create NGINX status metrics collector: %w", err)
}
if err := metrics.Registry.Register(ngxCollector); err != nil {
return nil, fmt.Errorf("failed to register NGINX status metrics collector: %w", err)
}
return metrics.Registry.Register(ngxCollector)

mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels)
if err := metrics.Registry.Register(mgrCollector); err != nil {
return nil, fmt.Errorf("failed to register NGINX manager runtime metrics collector: %w", err)
}

return mgrCollector, nil
}

func getMetricsOptions(cfg config.MetricsConfig) metricsserver.Options {
Expand Down
115 changes: 115 additions & 0 deletions internal/mode/static/metrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package metrics

import (
"time"

"github.com/prometheus/client_golang/prometheus"
)

// ManagerMetricsCollector implements ManagerCollector interface and prometheus.Collector interface.
// It groups the NGINX reload metrics so that they are described and collected together.
type ManagerMetricsCollector struct {
// Metrics
// reloadsTotal counts successful NGINX reloads.
reloadsTotal prometheus.Counter
// reloadsError counts unsuccessful NGINX reloads.
reloadsError prometheus.Counter
// configStale is set to 1 after a failed reload and back to 0 after a successful one.
configStale prometheus.Gauge
// reloadsDuration is a histogram of NGINX reload durations, observed in milliseconds.
reloadsDuration prometheus.Histogram
}

// NewManagerMetricsCollector builds a ManagerMetricsCollector. Every metric it owns is created
// under metricsNamespace and carries the supplied constLabels.
func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCollector {
	successes := prometheus.NewCounter(prometheus.CounterOpts{
		Name:        "nginx_reloads_total",
		Namespace:   metricsNamespace,
		Help:        "Number of successful NGINX reloads",
		ConstLabels: constLabels,
	})

	failures := prometheus.NewCounter(prometheus.CounterOpts{
		Name:        "nginx_reload_errors_total",
		Namespace:   metricsNamespace,
		Help:        "Number of unsuccessful NGINX reloads",
		ConstLabels: constLabels,
	})

	stale := prometheus.NewGauge(prometheus.GaugeOpts{
		Name:        "nginx_stale_config",
		Namespace:   metricsNamespace,
		Help:        "Indicates if NGINX is not serving the latest configuration.",
		ConstLabels: constLabels,
	})

	durations := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:        "nginx_reloads_milliseconds",
		Namespace:   metricsNamespace,
		Help:        "Duration in milliseconds of NGINX reloads",
		ConstLabels: constLabels,
		// Bucket upper bounds are in milliseconds, matching the unit in the metric name.
		Buckets: []float64{500, 1000, 5000, 10000, 30000},
	})

	return &ManagerMetricsCollector{
		reloadsTotal:    successes,
		reloadsError:    failures,
		configStale:     stale,
		reloadsDuration: durations,
	}
}

// IncReloadCount increments the counter of successful NGINX reloads and sets the stale config
// status to false (gauge value 0).
func (mc *ManagerMetricsCollector) IncReloadCount() {
	mc.reloadsTotal.Inc()
	mc.updateConfigStaleStatus(false)
}

// IncReloadErrors increments the counter of NGINX reload errors and sets the stale config
// status to true (gauge value 1).
func (mc *ManagerMetricsCollector) IncReloadErrors() {
	mc.reloadsError.Inc()
	mc.updateConfigStaleStatus(true)
}

// updateConfigStaleStatus writes the stale-config gauge: 1 when stale is true, 0 otherwise.
func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) {
	if stale {
		mc.configStale.Set(1.0)
		return
	}
	mc.configStale.Set(0)
}

// ObserveLastReloadTime records the given reload duration in the reload-duration histogram.
// The value is observed as a whole number of milliseconds; any sub-millisecond remainder is
// truncated.
func (mc *ManagerMetricsCollector) ObserveLastReloadTime(duration time.Duration) {
	elapsedMs := duration.Milliseconds()
	mc.reloadsDuration.Observe(float64(elapsedMs))
}

// Describe implements the prometheus.Collector interface Describe method by forwarding the
// channel to each owned metric in a fixed order.
func (mc *ManagerMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
	owned := []prometheus.Collector{
		mc.reloadsTotal,
		mc.reloadsError,
		mc.configStale,
		mc.reloadsDuration,
	}
	for _, metric := range owned {
		metric.Describe(ch)
	}
}

// Collect implements the prometheus.Collector interface Collect method by forwarding the
// channel to each owned metric in a fixed order.
func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) {
	owned := []prometheus.Collector{
		mc.reloadsTotal,
		mc.reloadsError,
		mc.configStale,
		mc.reloadsDuration,
	}
	for _, metric := range owned {
		metric.Collect(ch)
	}
}

// ManagerNoopCollector is a no-op collector that implements the ManagerCollector interface.
// Used to initialize the ManagerCollector when metrics are disabled to avoid nil pointer errors.
// It holds no state; every method is an empty function.
type ManagerNoopCollector struct{}

// NewManagerNoopCollector returns a pointer to a fresh ManagerNoopCollector, a ManagerCollector
// implementation whose methods all do nothing.
func NewManagerNoopCollector() *ManagerNoopCollector {
	return new(ManagerNoopCollector)
}

// IncReloadCount implements a no-op IncReloadCount.
// It deliberately records nothing.
func (mc *ManagerNoopCollector) IncReloadCount() {}

// IncReloadErrors implements a no-op IncReloadErrors.
// It deliberately records nothing.
func (mc *ManagerNoopCollector) IncReloadErrors() {}

// ObserveLastReloadTime implements a no-op ObserveLastReloadTime.
// The duration argument is ignored.
func (mc *ManagerNoopCollector) ObserveLastReloadTime(_ time.Duration) {}
4 changes: 4 additions & 0 deletions internal/mode/static/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package metrics

// metricsNamespace is the Prometheus namespace used for the metrics created in this package.
// nolint:gosec // flagged as potential hardcoded credentials, but is not sensitive
const metricsNamespace = "nginx_kubernetes_gateway"
2 changes: 1 addition & 1 deletion internal/mode/static/metrics/nginx.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func NewNginxMetricsCollector(constLabels map[string]string) (prometheus.Collect
if err != nil {
return nil, err
}
return nginxCollector.NewNginxCollector(client, "nginx_kubernetes_gateway", constLabels), nil
return nginxCollector.NewNginxCollector(client, metricsNamespace, constLabels), nil
}

// getSocketClient gets an http.Client with a unix socket transport.
Expand Down
28 changes: 24 additions & 4 deletions internal/mode/static/nginx/runtime/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,29 @@ type Manager interface {
Reload(ctx context.Context, configVersion int) error
}

// ManagerCollector is an interface for the metrics of the NGINX runtime manager.
type ManagerCollector interface {
	// IncReloadCount records one successful NGINX reload.
	IncReloadCount()
	// IncReloadErrors records one failed NGINX reload.
	IncReloadErrors()
	// ObserveLastReloadTime records how long an NGINX reload took. The argument is a
	// time.Duration, not a raw millisecond count; implementations choose the reported unit.
	ObserveLastReloadTime(duration time.Duration)
}

// ManagerImpl implements Manager.
type ManagerImpl struct {
verifyClient *verifyClient
verifyClient *verifyClient
managerCollector ManagerCollector
}

// NewManagerImpl creates a new ManagerImpl.
func NewManagerImpl() *ManagerImpl {
func NewManagerImpl(managerCollector ManagerCollector) *ManagerImpl {
return &ManagerImpl{
verifyClient: newVerifyClient(nginxReloadTimeout),
verifyClient: newVerifyClient(nginxReloadTimeout),
managerCollector: managerCollector,
}
}

func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
ciarams87 marked this conversation as resolved.
Show resolved Hide resolved
start := time.Now()
// We find the main NGINX PID on every reload because it will change if the NGINX container is restarted.
pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout)
if err != nil {
Expand All @@ -69,6 +79,7 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
// send HUP signal to the NGINX main process to reload configuration
// See https://nginx.org/en/docs/control.html
if err := syscall.Kill(pid, syscall.SIGHUP); err != nil {
m.managerCollector.IncReloadErrors()
return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err)
}

Expand All @@ -79,10 +90,19 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
os.ReadFile,
childProcsTimeout,
); err != nil {
m.managerCollector.IncReloadErrors()
return fmt.Errorf(noNewWorkersErrFmt, configVersion, err)
}

return m.verifyClient.waitForCorrectVersion(ctx, configVersion)
if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil {
m.managerCollector.IncReloadErrors()
return err
}
m.managerCollector.IncReloadCount()

finish := time.Now()
m.managerCollector.ObserveLastReloadTime(finish.Sub(start))
return nil
}

// EnsureNginxRunning ensures NGINX is running by locating the main process.
Expand Down