diff --git a/pkg/config/setup/system_probe.go b/pkg/config/setup/system_probe.go index 21870d884151c..e7754f72c842b 100644 --- a/pkg/config/setup/system_probe.go +++ b/pkg/config/setup/system_probe.go @@ -52,6 +52,9 @@ const ( defaultZypperReposDirSuffix = "/zypp/repos.d" defaultOffsetThreshold = 400 + + // defaultEnvoyPath is the default path for envoy binary + defaultEnvoyPath = "/bin/envoy" ) var ( @@ -239,6 +242,7 @@ func InitSystemProbeConfig(cfg pkgconfigmodel.Config) { cfg.BindEnv(join(smNS, "enable_postgres_monitoring")) cfg.BindEnv(join(smNS, "enable_redis_monitoring")) cfg.BindEnvAndSetDefault(join(smNS, "tls", "istio", "enabled"), false) + cfg.BindEnvAndSetDefault(join(smNS, "tls", "istio", "envoy_path"), defaultEnvoyPath) cfg.BindEnv(join(smNS, "tls", "nodejs", "enabled")) cfg.BindEnvAndSetDefault(join(smjtNS, "enabled"), false) cfg.BindEnvAndSetDefault(join(smjtNS, "debug"), false) diff --git a/pkg/network/config/config.go b/pkg/network/config/config.go index 4689ebf0f27b6..cc7d9c7640d6a 100644 --- a/pkg/network/config/config.go +++ b/pkg/network/config/config.go @@ -96,6 +96,9 @@ type Config struct { // EnableIstioMonitoring specifies whether USM should monitor Istio traffic EnableIstioMonitoring bool + // EnvoyPath specifies the envoy path to be used for Istio monitoring + EnvoyPath string + // EnableNodeJSMonitoring specifies whether USM should monitor NodeJS TLS traffic EnableNodeJSMonitoring bool @@ -369,6 +372,7 @@ func New() *Config { EnableRedisMonitoring: cfg.GetBool(join(smNS, "enable_redis_monitoring")), EnableNativeTLSMonitoring: cfg.GetBool(join(smNS, "tls", "native", "enabled")), EnableIstioMonitoring: cfg.GetBool(join(smNS, "tls", "istio", "enabled")), + EnvoyPath: cfg.GetString(join(smNS, "tls", "istio", "envoy_path")), EnableNodeJSMonitoring: cfg.GetBool(join(smNS, "tls", "nodejs", "enabled")), MaxUSMConcurrentRequests: uint32(cfg.GetInt(join(smNS, "max_concurrent_requests"))), MaxHTTPStatsBuffered: cfg.GetInt(join(smNS, "max_http_stats_buffered")), diff --git a/pkg/network/config/config_test.go b/pkg/network/config/config_test.go index f1abf3879398f..04deabbf893b8 100644 --- a/pkg/network/config/config_test.go +++ b/pkg/network/config/config_test.go @@ -1416,6 +1416,33 @@ service_monitoring_config: }) } +func TestEnvoyPathConfig(t *testing.T) { + t.Run("default value", func(t *testing.T) { + aconfig.ResetSystemProbeConfig(t) + cfg := New() + assert.EqualValues(t, cfg.EnvoyPath, "/bin/envoy") + }) + + t.Run("via yaml", func(t *testing.T) { + aconfig.ResetSystemProbeConfig(t) + cfg := configurationFromYAML(t, ` +service_monitoring_config: + tls: + istio: + envoy_path: "/test/envoy" +`) + assert.EqualValues(t, "/test/envoy", cfg.EnvoyPath) + }) + + t.Run("value set through env var", func(t *testing.T) { + aconfig.ResetSystemProbeConfig(t) + t.Setenv("DD_SERVICE_MONITORING_CONFIG_TLS_ISTIO_ENVOY_PATH", "/test/envoy") + + cfg := New() + assert.EqualValues(t, "/test/envoy", cfg.EnvoyPath) + }) +} + func TestNodeJSMonitoring(t *testing.T) { t.Run("default value", func(t *testing.T) { aconfig.ResetSystemProbeConfig(t) diff --git a/pkg/network/usm/ebpf_gotls.go b/pkg/network/usm/ebpf_gotls.go index 15bebbdd48a8e..0eb4a1613cec3 100644 --- a/pkg/network/usm/ebpf_gotls.go +++ b/pkg/network/usm/ebpf_gotls.go @@ -319,21 +319,8 @@ func (p *goTLSProgram) AttachPID(pid uint32) error { pidAsStr := strconv.FormatUint(uint64(pid), 10) exePath := filepath.Join(p.procRoot, pidAsStr, "exe") - binPath, err := os.Readlink(exePath) + binPath, err := utils.ResolveSymlink(exePath) if err != nil { - // We receive the Exec event, /proc could be slow to update - end := time.Now().Add(10 * time.Millisecond) - for end.After(time.Now()) { - binPath, err = os.Readlink(exePath) - if err == nil { - break - } - time.Sleep(time.Millisecond) - } - } - if err != nil { - // we can't access to the binary path here (pid probably ended already) - // there are not much we can do, and we don't want to flood the logs return err } diff --git a/pkg/network/usm/istio.go b/pkg/network/usm/istio.go index 4f9a911e1f29d..e5f291b647f87 100644 --- a/pkg/network/usm/istio.go +++ b/pkg/network/usm/istio.go @@ -8,9 +8,8 @@ package usm import ( - "bytes" "fmt" - "os" + "strings" "sync" "time" @@ -19,8 +18,6 @@ import ( "github.com/DataDog/datadog-agent/pkg/process/monitor" "github.com/DataDog/datadog-agent/pkg/util/kernel" "github.com/DataDog/datadog-agent/pkg/util/log" - ddsync "github.com/DataDog/datadog-agent/pkg/util/sync" - manager "github.com/DataDog/ebpf-manager" ) @@ -76,16 +73,6 @@ var istioProbes = []manager.ProbesSelector{ }, } -// envoyCmd represents the search term used for determining -// whether or not a given PID represents an Envoy process. -// The search is done over the /proc//cmdline file. -var envoyCmd = []byte("/bin/envoy") - -// readBufferPool is used for reading /proc//cmdline files. -// We use a pointer to a slice to avoid allocations when casting -// values to the empty interface during Put() calls. -var readBufferPool = ddsync.NewSlicePool[byte](128, 128) - // istioMonitor essentially scans for Envoy processes and attaches SSL uprobes // to them. // @@ -95,6 +82,7 @@ var readBufferPool = ddsync.NewSlicePool[byte](128, 128) type istioMonitor struct { registry *utils.FileRegistry procRoot string + envoyCmd string // `utils.FileRegistry` callbacks registerCB func(utils.FilePath) error @@ -117,6 +105,7 @@ func newIstioMonitor(c *config.Config, mgr *manager.Manager) *istioMonitor { return &istioMonitor{ registry: utils.NewFileRegistry("istio"), procRoot: procRoot, + envoyCmd: c.EnvoyPath, done: make(chan struct{}), // Callbacks @@ -250,46 +239,19 @@ func (m *istioMonitor) handleProcessExec(pid uint32) { } // getEnvoyPath returns the executable path of the envoy binary for a given PID. -// In case the PID doesn't represent an envoy process, an empty string is returned. +// It constructs the path to the symbolic link for the executable file of the process with the given PID, +// then resolves this symlink to determine the actual path of the binary. // -// TODO: -// refine process detection heuristic so we can remove the number of false -// positives. A common case that is likely worth optimizing for is filtering -// out "vanilla" envoy processes, and selecting only envoy processes that are -// running inside istio containers. Based on a quick inspection I made, it -// seems that we could also search for "istio" in the cmdline string in addition -// to "envoy", since the command line arguments look more or less the following: -// -// /usr/local/bin/envoy -cetc/istio/proxy/envoy-rev.json ... +// If the resolved path contains the expected envoy command substring (as defined by m.envoyCmd), +// the function returns this path. If the PID does not correspond to an envoy process or if an error +// occurs during resolution, it returns an empty string. func (m *istioMonitor) getEnvoyPath(pid uint32) string { - cmdlinePath := fmt.Sprintf("%s/%d/cmdline", m.procRoot, pid) - - f, err := os.Open(cmdlinePath) - if err != nil { - // This can happen often in the context of ephemeral processes - return "" - } - defer f.Close() - - // From here on we shouldn't allocate for the common case - // (eg., a process is *not* envoy) - bufferPtr := readBufferPool.Get() - defer func() { - readBufferPool.Put(bufferPtr) - }() - - buffer := *bufferPtr - n, _ := f.Read(buffer) - if n == 0 { - return "" - } + exePath := fmt.Sprintf("%s/%d/exe", m.procRoot, pid) - buffer = buffer[:n] - i := bytes.Index(buffer, envoyCmd) - if i < 0 { + envoyPath, err := utils.ResolveSymlink(exePath) + if err != nil || !strings.Contains(envoyPath, m.envoyCmd) { return "" } - executable := buffer[:i+len(envoyCmd)] - return string(executable) + return envoyPath } diff --git a/pkg/network/usm/istio_test.go b/pkg/network/usm/istio_test.go index db08722435444..e61c3372ccc77 100644 --- a/pkg/network/usm/istio_test.go +++ b/pkg/network/usm/istio_test.go @@ -8,25 +8,28 @@ package usm import ( - "os" + "os/exec" "path/filepath" + "strings" "testing" "github.com/DataDog/datadog-agent/pkg/network/config" "github.com/DataDog/datadog-agent/pkg/network/usm/utils" - "github.com/DataDog/datadog-agent/pkg/util/kernel" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) +const ( + defaultEnvoyName = "/bin/envoy" +) + func TestGetEnvoyPath(t *testing.T) { - _ = createFakeProcFS(t) + _, pid := createFakeProcess(t, defaultEnvoyName) monitor := newIstioTestMonitor(t) t.Run("an actual envoy process", func(t *testing.T) { - path := monitor.getEnvoyPath(uint32(1)) - assert.Equal(t, "/usr/local/bin/envoy", path) + path := monitor.getEnvoyPath(uint32(pid)) + assert.True(t, strings.HasSuffix(path, defaultEnvoyName)) }) t.Run("something else", func(t *testing.T) { path := monitor.getEnvoyPath(uint32(2)) @@ -34,32 +37,22 @@ func TestGetEnvoyPath(t *testing.T) { }) } -func TestIstioSync(t *testing.T) { - t.Run("calling sync for the first time", func(t *testing.T) { - procRoot := createFakeProcFS(t) - monitor := newIstioTestMonitor(t) - registerRecorder := new(utils.CallbackRecorder) - - // Setup test callbacks - monitor.registerCB = registerRecorder.Callback() - monitor.unregisterCB = utils.IgnoreCB - - // Calling sync should detect the two envoy processes - monitor.sync() - - pathID1, err := utils.NewPathIdentifier(filepath.Join(procRoot, "1/root/usr/local/bin/envoy")) - require.NoError(t, err) +func TestGetEnvoyPathWithConfig(t *testing.T) { + cfg := config.New() + cfg.EnableIstioMonitoring = true + cfg.EnvoyPath = "/test/envoy" + monitor := newIstioTestMonitorWithCFG(t, cfg) - pathID2, err := utils.NewPathIdentifier(filepath.Join(procRoot, "3/root/usr/local/bin/envoy")) - require.NoError(t, err) + _, pid := createFakeProcess(t, cfg.EnvoyPath) - assert.Equal(t, 2, registerRecorder.TotalCalls()) - assert.Equal(t, 1, registerRecorder.CallsForPathID(pathID1)) - assert.Equal(t, 1, registerRecorder.CallsForPathID(pathID2)) - }) + path := monitor.getEnvoyPath(uint32(pid)) + assert.True(t, strings.HasSuffix(path, cfg.EnvoyPath)) +} +func TestIstioSync(t *testing.T) { t.Run("calling sync multiple times", func(t *testing.T) { - procRoot := createFakeProcFS(t) + procRoot1, _ := createFakeProcess(t, filepath.Join("test1", defaultEnvoyName)) + procRoot2, _ := createFakeProcess(t, filepath.Join("test2", defaultEnvoyName)) monitor := newIstioTestMonitor(t) registerRecorder := new(utils.CallbackRecorder) @@ -72,12 +65,11 @@ func TestIstioSync(t *testing.T) { // trigger additional callback executions monitor.sync() monitor.sync() - monitor.sync() - pathID1, err := utils.NewPathIdentifier(filepath.Join(procRoot, "1/root/usr/local/bin/envoy")) + pathID1, err := utils.NewPathIdentifier(procRoot1) require.NoError(t, err) - pathID2, err := utils.NewPathIdentifier(filepath.Join(procRoot, "3/root/usr/local/bin/envoy")) + pathID2, err := utils.NewPathIdentifier(procRoot2) require.NoError(t, err) // Each PathID should have triggered a callback exactly once @@ -85,122 +77,37 @@ func TestIstioSync(t *testing.T) { assert.Equal(t, 1, registerRecorder.CallsForPathID(pathID1)) assert.Equal(t, 1, registerRecorder.CallsForPathID(pathID2)) }) +} - t.Run("detecting a dangling process", func(t *testing.T) { - procRoot := createFakeProcFS(t) - monitor := newIstioTestMonitor(t) - registerRecorder := new(utils.CallbackRecorder) - unregisterRecorder := new(utils.CallbackRecorder) - - // Setup test callbacks - monitor.registerCB = registerRecorder.Callback() - monitor.unregisterCB = unregisterRecorder.Callback() - - monitor.sync() - - // The first call to sync() will start tracing PIDs 1 and 3, but not PID 2 - assert.Contains(t, monitor.registry.GetRegisteredProcesses(), uint32(1)) - assert.NotContains(t, monitor.registry.GetRegisteredProcesses(), uint32(2)) - assert.Contains(t, monitor.registry.GetRegisteredProcesses(), uint32(3)) - - // At this point we should have received: - // * 2 register calls - // * 0 unregister calls - assert.Equal(t, 2, registerRecorder.TotalCalls()) - assert.Equal(t, 0, unregisterRecorder.TotalCalls()) +// createFakeProcess creates a fake process in a temporary location. +// returns the full path of the temporary process and the PID of the fake process. +func createFakeProcess(t *testing.T, processName string) (procRoot string, pid int) { + fakePath := filepath.Join(t.TempDir(), processName) + require.NoError(t, exec.Command("mkdir", "-p", filepath.Dir(fakePath)).Run()) - // Now we emulate a process termination for PID 3 by removing it from the fake - // procFS tree - require.NoError(t, os.RemoveAll(filepath.Join(procRoot, "3"))) + // we are using the `yes` command as a fake envoy binary. + require.NoError(t, exec.Command("cp", "/usr/bin/yes", fakePath).Run()) - // Once we call sync() again, PID 3 termination should be detected - // and the unregister callback should be executed - monitor.sync() - assert.Equal(t, 1, unregisterRecorder.TotalCalls()) - assert.NotContains(t, monitor.registry.GetRegisteredProcesses(), uint32(3)) - }) -} + cmd := exec.Command(fakePath) + require.NoError(t, cmd.Start()) -// This creates a bare-bones procFS with a structure that looks like -// the following: -// -// proc/ -// ├── 1 -// │   ├── cmdline -// │   └── root -// │   └── usr -// │   └── local -// │   └── bin -// │   └── envoy -// ... -// -// This ProcFS contains 3 PIDs: -// -// PID 1 -> Envoy process -// PID 2 -> Bash process -// PID 3 -> Envoy process -func createFakeProcFS(t *testing.T) (procRoot string) { - procRoot = t.TempDir() - - // Inject fake ProcFS path - previousFn := kernel.ProcFSRoot - kernel.ProcFSRoot = func() string { return procRoot } + // Schedule process termination after the test t.Cleanup(func() { - kernel.ProcFSRoot = previousFn + _ = cmd.Process.Kill() }) - // Taken from a real istio-proxy container - const envoyCmdline = "/usr/local/bin/envoy" + - "-cetc/istio/proxy/envoy-rev.json" + - "--drain-time-s45" + - "--drain-strategyimmediate" + - "--local-address-ip-versionv4" + - "--file-flush-interval-msec1000" + - "--disable-hot-restart" + - "--allow-unknown-static-fields" + - "--log-format" - - // PID 1 - createFile(t, - filepath.Join(procRoot, "1", "cmdline"), - envoyCmdline, - ) - createFile(t, - filepath.Join(procRoot, "1", "root/usr/local/bin/envoy"), - "", - ) - - // PID 2 - createFile(t, - filepath.Join(procRoot, "2", "cmdline"), - "/bin/bash", - ) - - // PID 3 - createFile(t, - filepath.Join(procRoot, "3", "cmdline"), - envoyCmdline, - ) - createFile(t, - filepath.Join(procRoot, "3", "root/usr/local/bin/envoy"), - "", - ) - - return -} - -func createFile(t *testing.T, path, data string) { - dir := filepath.Dir(path) - require.NoError(t, os.MkdirAll(dir, 0775)) - require.NoError(t, os.WriteFile(path, []byte(data), 0775)) + return fakePath, cmd.Process.Pid } func newIstioTestMonitor(t *testing.T) *istioMonitor { cfg := config.New() cfg.EnableIstioMonitoring = true + return newIstioTestMonitorWithCFG(t, cfg) +} + +func newIstioTestMonitorWithCFG(t *testing.T, cfg *config.Config) *istioMonitor { monitor := newIstioMonitor(cfg, nil) require.NotNil(t, monitor) - return monitor } diff --git a/pkg/network/usm/utils/pathutils.go b/pkg/network/usm/utils/pathutils.go new file mode 100644 index 0000000000000..088ac110f500b --- /dev/null +++ b/pkg/network/usm/utils/pathutils.go @@ -0,0 +1,38 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. + +//go:build linux + +package utils + +import ( + "os" + "time" +) + +// ResolveSymlink returns the target path of the symbolic link specified by linkPath. +// If the link target is relative, it returns the relative path without resolving it to an absolute path. +// If the link target cannot be resolved immediately, it retries for a short period. +func ResolveSymlink(linkPath string) (string, error) { + targetPath, err := os.Readlink(linkPath) + if err != nil { + // If Readlink fails, retry for up to 10 milliseconds in case of transient issues. + end := time.Now().Add(10 * time.Millisecond) + for end.After(time.Now()) { + targetPath, err = os.Readlink(linkPath) + if err == nil { + break + } + time.Sleep(time.Millisecond) + } + } + + if err != nil { + // we can't access to the binary path here (pid probably ended already) + // there are not much we can do, and we don't want to flood the logs + return "", err + } + return targetPath, nil +}