Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[service/proctelemetry] Offer to override the HOST_PROC environment variable with a programmatic value #7998

Merged
merged 14 commits into from
Aug 15, 2023
16 changes: 16 additions & 0 deletions .chloggen/metric-server-hostproc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix
atoulme marked this conversation as resolved.
Show resolved Hide resolved

# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver)
component: service/proctelemetry

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Make sure OTEL internal metrics is reading and reporting its own process data when HOST_PROC is set set
atoulme marked this conversation as resolved.
Show resolved Hide resolved

# One or more tracking issues or pull requests related to the change
issues: [7434]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: If user wants to use the path defined in the environment variable HOST_PROC, then they can set `useHostProcEnvVar` to true
15 changes: 11 additions & 4 deletions service/internal/proctelemetry/process_telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"sync"
"time"

"github.com/shirou/gopsutil/v3/common"
"github.com/shirou/gopsutil/v3/process"
"go.opencensus.io/metric"
"go.opencensus.io/stats"
Expand All @@ -27,6 +28,7 @@ type processMetrics struct {
startTimeUnixNano int64
ballastSizeBytes uint64
proc *process.Process
context context.Context

processUptime *metric.Float64DerivedCumulative
allocMem *metric.Int64DerivedGauge
Expand All @@ -51,15 +53,20 @@ type processMetrics struct {

// RegisterProcessMetrics creates a new set of processMetrics (mem, cpu) that can be used to measure
// basic information about this process.
func RegisterProcessMetrics(ocRegistry *metric.Registry, mp otelmetric.MeterProvider, useOtel bool, ballastSizeBytes uint64) error {
func RegisterProcessMetrics(ocRegistry *metric.Registry, mp otelmetric.MeterProvider, useOtel bool, ballastSizeBytes uint64, hostProc string) error {
var err error
pm := &processMetrics{
startTimeUnixNano: time.Now().UnixNano(),
ballastSizeBytes: ballastSizeBytes,
ms: &runtime.MemStats{},
}

pm.proc, err = process.NewProcess(int32(os.Getpid()))
ctx := context.Background()
if hostProc != "" {
atoulme marked this conversation as resolved.
Show resolved Hide resolved
ctx = context.WithValue(ctx, common.EnvKey, common.EnvMap{common.HostProcEnvKey: hostProc})
}
pm.context = ctx
pm.proc, err = process.NewProcessWithContext(pm.context, int32(os.Getpid()))
if err != nil {
return err
}
Expand Down Expand Up @@ -231,7 +238,7 @@ func (pm *processMetrics) updateSysMem() int64 {
}

func (pm *processMetrics) updateCPUSeconds() float64 {
times, err := pm.proc.Times()
times, err := pm.proc.TimesWithContext(pm.context)
if err != nil {
return 0
}
Expand All @@ -241,7 +248,7 @@ func (pm *processMetrics) updateCPUSeconds() float64 {
}

func (pm *processMetrics) updateRSSMemory() int64 {
mem, err := pm.proc.MemoryInfo()
mem, err := pm.proc.MemoryInfoWithContext(pm.context)
if err != nil {
return 0
}
Expand Down
53 changes: 53 additions & 0 deletions service/internal/proctelemetry/process_telemetry_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

//go:build linux
// +build linux

package proctelemetry

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opencensus.io/metric"
"go.opentelemetry.io/otel/metric/noop"
)

func TestOCProcessTelemetryWithHostProc(t *testing.T) {
ocRegistry := metric.NewRegistry()
// Make the sure the environment variable value is not used.
t.Setenv("HOST_PROC", "foo/bar")

require.NoError(t, RegisterProcessMetrics(ocRegistry, noop.NewMeterProvider(), false, 0, "/proc"))

// Check that the metrics are actually filled.
time.Sleep(200 * time.Millisecond)

metrics := ocRegistry.Read()

for _, metricName := range expectedMetrics {
m := findMetric(metrics, metricName)
require.NotNil(t, m)
require.Len(t, m.TimeSeries, 1)
ts := m.TimeSeries[0]
assert.Len(t, ts.LabelValues, 0)
require.Len(t, ts.Points, 1)

var value float64
if metricName == "process/uptime" || metricName == "process/cpu_seconds" {
value = ts.Points[0].Value.(float64)
} else {
value = float64(ts.Points[0].Value.(int64))
}

if metricName == "process/uptime" || metricName == "process/cpu_seconds" {
// This likely will still be zero when running the test.
assert.GreaterOrEqual(t, value, float64(0), metricName)
continue
}
assert.Greater(t, value, float64(0), metricName)
}
}
9 changes: 4 additions & 5 deletions service/internal/proctelemetry/process_telemetry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func fetchPrometheusMetrics(handler http.Handler) (map[string]*io_prometheus_cli
func TestOtelProcessTelemetry(t *testing.T) {
tel := setupTelemetry(t)

require.NoError(t, RegisterProcessMetrics(nil, tel.MeterProvider, true, 0))
require.NoError(t, RegisterProcessMetrics(nil, tel.MeterProvider, true, 0, ""))

mp, err := fetchPrometheusMetrics(tel.promHandler)
require.NoError(t, err)
Expand Down Expand Up @@ -137,10 +137,10 @@ func TestOtelProcessTelemetry(t *testing.T) {
func TestOCProcessTelemetry(t *testing.T) {
ocRegistry := metric.NewRegistry()

require.NoError(t, RegisterProcessMetrics(ocRegistry, noop.NewMeterProvider(), false, 0))
require.NoError(t, RegisterProcessMetrics(ocRegistry, noop.NewMeterProvider(), false, 0, ""))

// Check that the metrics are actually filled.
<-time.After(200 * time.Millisecond)
time.Sleep(200 * time.Millisecond)
atoulme marked this conversation as resolved.
Show resolved Hide resolved

metrics := ocRegistry.Read()

Expand All @@ -164,7 +164,6 @@ func TestOCProcessTelemetry(t *testing.T) {
assert.GreaterOrEqual(t, value, float64(0), metricName)
continue
}

assert.Greater(t, value, float64(0), metricName)
}
}
Expand All @@ -175,7 +174,7 @@ func TestProcessTelemetryFailToRegister(t *testing.T) {
ocRegistry := metric.NewRegistry()
_, err := ocRegistry.AddFloat64Gauge(metricName)
require.NoError(t, err)
assert.Error(t, RegisterProcessMetrics(ocRegistry, noop.NewMeterProvider(), false, 0))
assert.Error(t, RegisterProcessMetrics(ocRegistry, noop.NewMeterProvider(), false, 0, ""))
})
}
}
Expand Down
2 changes: 1 addition & 1 deletion service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ func (srv *Service) initExtensionsAndPipeline(ctx context.Context, set Settings,

if cfg.Telemetry.Metrics.Level != configtelemetry.LevelNone && cfg.Telemetry.Metrics.Address != "" {
// The process telemetry initialization requires the ballast size, which is available after the extensions are initialized.
if err = proctelemetry.RegisterProcessMetrics(srv.telemetryInitializer.ocRegistry, srv.telemetryInitializer.mp, obsreportconfig.UseOtelForInternalMetricsfeatureGate.IsEnabled(), getBallastSize(srv.host)); err != nil {
if err = proctelemetry.RegisterProcessMetrics(srv.telemetryInitializer.ocRegistry, srv.telemetryInitializer.mp, obsreportconfig.UseOtelForInternalMetricsfeatureGate.IsEnabled(), getBallastSize(srv.host), cfg.Telemetry.Metrics.HostProc); err != nil {
return fmt.Errorf("failed to register process metrics: %w", err)
}
}
Expand Down
5 changes: 5 additions & 0 deletions service/telemetry/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ type MetricsConfig struct {
// Readers allow configuration of metric readers to emit metrics to
// any number of supported backends.
Readers []MetricReader `mapstructure:"metric_readers"`

// HostProc is the path to the proc folder used to look up for the collector process.
// When unset, the collector will use the environment variable HOST_PROC if set, and
// default to `/proc` if no value is set.
atoulme marked this conversation as resolved.
Show resolved Hide resolved
HostProc string `mapstructure:"host_proc"`
}

// TracesConfig exposes the common Telemetry configuration for collector's internal spans.
Expand Down
25 changes: 22 additions & 3 deletions service/telemetry/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package telemetry

import (
"os"
"testing"

"github.com/stretchr/testify/assert"
Expand All @@ -13,9 +14,10 @@ import (

func TestLoadConfig(t *testing.T) {
tests := []struct {
name string
cfg *Config
success bool
name string
cfg *Config
success bool
hostproc string
}{
{
name: "basic metric telemetry",
Expand All @@ -37,10 +39,27 @@ func TestLoadConfig(t *testing.T) {
},
success: false,
},
{
name: "metric telemetry with valid host proc",
cfg: &Config{
Metrics: MetricsConfig{
Level: configtelemetry.LevelBasic,
Address: "127.0.0.1:3333",
HostProc: "/host/proc",
},
},
success: true,
hostproc: "/host/proc",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.hostproc != "" {
os.Setenv("HOST_PROC", tt.hostproc)
} else {
os.Unsetenv("HOST_PROC")
}
err := tt.cfg.Validate()
if tt.success {
assert.NoError(t, err)
Expand Down