From a9c5b6f7c8c54c6097a887c681e575e81430e3b1 Mon Sep 17 00:00:00 2001 From: Dani Louca Date: Mon, 27 Mar 2023 11:57:34 -0400 Subject: [PATCH] Unset HOST_PROC to make sure we use and report the actual process and introduce an option to allow user to overwrite the unset logic and use the value set in the environment variable Signed-off-by: Dani Louca --- .chloggen/metric-server-hostproc.yaml | 16 ++++++ .../proctelemetry/process_telemetry.go | 11 +++- .../proctelemetry/process_telemetry_test.go | 51 +++++++++++++++++-- service/service.go | 2 +- service/telemetry/config.go | 12 +++++ service/telemetry/config_test.go | 36 +++++++++++-- 6 files changed, 120 insertions(+), 8 deletions(-) create mode 100755 .chloggen/metric-server-hostproc.yaml diff --git a/.chloggen/metric-server-hostproc.yaml b/.chloggen/metric-server-hostproc.yaml new file mode 100755 index 00000000000..6de0fd1653a --- /dev/null +++ b/.chloggen/metric-server-hostproc.yaml @@ -0,0 +1,16 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver) +component: service/proctelemetry + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Make sure OTEL internal metrics is reading and reporting its own process data when HOST_PROC is set set + +# One or more tracking issues or pull requests related to the change +issues: [] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: If user wants to use the path defined in the environment variable HOST_PROC, then they can set `useHostProcEnvVar` to true diff --git a/service/internal/proctelemetry/process_telemetry.go b/service/internal/proctelemetry/process_telemetry.go index 58bee722167..bfc0cc8c6df 100644 --- a/service/internal/proctelemetry/process_telemetry.go +++ b/service/internal/proctelemetry/process_telemetry.go @@ -63,7 +63,7 @@ type processMetrics struct { // RegisterProcessMetrics creates a new set of processMetrics (mem, cpu) that can be used to measure // basic information about this process. -func RegisterProcessMetrics(ocRegistry *metric.Registry, mp otelmetric.MeterProvider, useOtel bool, ballastSizeBytes uint64) error { +func RegisterProcessMetrics(ocRegistry *metric.Registry, mp otelmetric.MeterProvider, useOtel bool, ballastSizeBytes uint64, useHostProcEnvVar bool) error { var err error pm := &processMetrics{ startTimeUnixNano: time.Now().UnixNano(), @@ -71,10 +71,19 @@ func RegisterProcessMetrics(ocRegistry *metric.Registry, mp otelmetric.MeterProv ms: &runtime.MemStats{}, } + hostproc, exists := os.LookupEnv("HOST_PROC") + // unset HOST_PROC env variable so the process search occurs locally + if !useHostProcEnvVar && exists { + os.Unsetenv("HOST_PROC") + } pm.proc, err = process.NewProcess(int32(os.Getpid())) if err != nil { return err } + // restore immediately if it was previously set, don't use defer + if !useHostProcEnvVar && exists { + os.Setenv("HOST_PROC", hostproc) + } if useOtel { return pm.recordWithOtel(mp.Meter(scopeName)) diff --git a/service/internal/proctelemetry/process_telemetry_test.go b/service/internal/proctelemetry/process_telemetry_test.go index 5ac3e1b0d90..19bd4e2b7cf 100644 --- a/service/internal/proctelemetry/process_telemetry_test.go +++ b/service/internal/proctelemetry/process_telemetry_test.go @@ -18,6 +18,7 @@ import ( "context" "net/http" "net/http/httptest" + "os" "strings" "testing" "time" @@ -116,7 +117,7 @@ func fetchPrometheusMetrics(handler http.Handler) (map[string]*io_prometheus_cli func TestOtelProcessTelemetry(t *testing.T) { tel := setupTelemetry(t) - require.NoError(t, RegisterProcessMetrics(nil, tel.MeterProvider, true, 0)) + require.NoError(t, RegisterProcessMetrics(nil, tel.MeterProvider, true, 0, false)) mp, err := fetchPrometheusMetrics(tel.promHandler) require.NoError(t, err) @@ -148,7 +149,51 @@ func TestOtelProcessTelemetry(t *testing.T) { func TestOCProcessTelemetry(t *testing.T) { ocRegistry := metric.NewRegistry() - require.NoError(t, RegisterProcessMetrics(ocRegistry, otelmetric.NewNoopMeterProvider(), false, 0)) + require.NoError(t, RegisterProcessMetrics(ocRegistry, otelmetric.NewNoopMeterProvider(), false, 0, false)) + + // Check that the metrics are actually filled. + <-time.After(200 * time.Millisecond) + + metrics := ocRegistry.Read() + + for _, metricName := range expectedMetrics { + m := findMetric(metrics, metricName) + require.NotNil(t, m) + require.Len(t, m.TimeSeries, 1) + ts := m.TimeSeries[0] + assert.Len(t, ts.LabelValues, 0) + require.Len(t, ts.Points, 1) + + var value float64 + if metricName == "process/uptime" || metricName == "process/cpu_seconds" { + value = ts.Points[0].Value.(float64) + } else { + value = float64(ts.Points[0].Value.(int64)) + } + + if metricName == "process/uptime" || metricName == "process/cpu_seconds" { + // This likely will still be zero when running the test. + assert.GreaterOrEqual(t, value, float64(0), metricName) + continue + } + + assert.Greater(t, value, float64(0), metricName) + } +} + +func TestOCProcessTelemetryWithHostProc(t *testing.T) { + ocRegistry := metric.NewRegistry() + hostProc := "/host/proc" + os.Setenv("HOST_PROC", hostProc) + + require.NoError(t, RegisterProcessMetrics(ocRegistry, otelmetric.NewNoopMeterProvider(), false, 0, false)) + + // make sure HOST_PROC is restored + envValue, _ := os.LookupEnv("HOST_PROC") + require.Equal(t, envValue, hostProc) + + // unset HOST_PROC so other tests are not impacted + defer os.Unsetenv("HOST_PROC") // Check that the metrics are actually filled. <-time.After(200 * time.Millisecond) @@ -186,7 +231,7 @@ func TestProcessTelemetryFailToRegister(t *testing.T) { ocRegistry := metric.NewRegistry() _, err := ocRegistry.AddFloat64Gauge(metricName) require.NoError(t, err) - assert.Error(t, RegisterProcessMetrics(ocRegistry, otelmetric.NewNoopMeterProvider(), false, 0)) + assert.Error(t, RegisterProcessMetrics(ocRegistry, otelmetric.NewNoopMeterProvider(), false, 0, false)) }) } } diff --git a/service/service.go b/service/service.go index e3f3f2999ca..79a4a033d67 100644 --- a/service/service.go +++ b/service/service.go @@ -214,7 +214,7 @@ func (srv *Service) initExtensionsAndPipeline(ctx context.Context, set Settings, if cfg.Telemetry.Metrics.Level != configtelemetry.LevelNone && cfg.Telemetry.Metrics.Address != "" { // The process telemetry initialization requires the ballast size, which is available after the extensions are initialized. - if err = proctelemetry.RegisterProcessMetrics(srv.telemetryInitializer.ocRegistry, srv.telemetryInitializer.mp, obsreportconfig.UseOtelForInternalMetricsfeatureGate.IsEnabled(), getBallastSize(srv.host)); err != nil { + if err = proctelemetry.RegisterProcessMetrics(srv.telemetryInitializer.ocRegistry, srv.telemetryInitializer.mp, obsreportconfig.UseOtelForInternalMetricsfeatureGate.IsEnabled(), getBallastSize(srv.host), cfg.Telemetry.Metrics.UseHostProcEnvVar); err != nil { return fmt.Errorf("failed to register process metrics: %w", err) } } diff --git a/service/telemetry/config.go b/service/telemetry/config.go index b508e4727fc..9856fd1720c 100644 --- a/service/telemetry/config.go +++ b/service/telemetry/config.go @@ -16,6 +16,7 @@ package telemetry // import "go.opentelemetry.io/collector/service/telemetry" import ( "fmt" + "os" "go.uber.org/zap/zapcore" @@ -116,6 +117,10 @@ type MetricsConfig struct { // Address is the [address]:port that metrics exposition should be bound to. Address string `mapstructure:"address"` + + // UseHostProcEnvVar when set to true, the metric server, through gopsutil, + // lookup for the otel process in the proc path defined in the HOST_PROC environment variable. + UseHostProcEnvVar bool `mapstructure:"useHostProcEnvVar"` } // TracesConfig exposes the common Telemetry configuration for collector's internal spans. @@ -134,6 +139,13 @@ func (c *Config) Validate() error { if c.Metrics.Level != configtelemetry.LevelNone && c.Metrics.Address == "" { return fmt.Errorf("collector telemetry metric address should exist when metric level is not none") } + // Validate that the hostproc env variable is set when metric server is enabled + if c.Metrics.Level != configtelemetry.LevelNone && c.Metrics.Address != "" && c.Metrics.UseHostProcEnvVar { + if _, exists := os.LookupEnv("HOST_PROC"); !exists { + return fmt.Errorf("collector telemetry metric UueHostProcEnvVar " + + "is set to true, but HOST_PROC env variavle is not set") + } + } return nil } diff --git a/service/telemetry/config_test.go b/service/telemetry/config_test.go index 4dddda15264..3b9236ca762 100644 --- a/service/telemetry/config_test.go +++ b/service/telemetry/config_test.go @@ -15,6 +15,7 @@ package telemetry import ( + "os" "testing" "github.com/stretchr/testify/assert" @@ -24,9 +25,10 @@ import ( func TestLoadConfig(t *testing.T) { tests := []struct { - name string - cfg *Config - success bool + name string + cfg *Config + success bool + hostproc string }{ { name: "basic metric telemetry", @@ -48,10 +50,38 @@ func TestLoadConfig(t *testing.T) { }, success: false, }, + { + name: "metric telemetry with valid useHostProcEnvVar", + cfg: &Config{ + Metrics: MetricsConfig{ + Level: configtelemetry.LevelBasic, + Address: "127.0.0.1:3333", + UseHostProcEnvVar: true, + }, + }, + success: true, + hostproc: "/host/proc", + }, + { + name: "metric telemetry with useHostProcEnvVar but actual env variable not set", + cfg: &Config{ + Metrics: MetricsConfig{ + Level: configtelemetry.LevelBasic, + Address: "127.0.0.1:3333", + UseHostProcEnvVar: true, + }, + }, + success: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + if tt.hostproc != "" { + os.Setenv("HOST_PROC", tt.hostproc) + } else { + os.Unsetenv("HOST_PROC") + } err := tt.cfg.Validate() if tt.success { assert.NoError(t, err)