Make /proc/pid/io lookup failures skippable (#129)

## What does this PR do? This fixes a change in the behavior of the system metrics where fetching process metrics on a container without `SYS_PTRACE` results in a failure, as `/proc/pid/io` requires `SYS_PTRACE`. This flies under the radar in a handful of use cases, as our docs mention `SYS_PTRACE` in other contexts, and many users monitoring system data will (in my experience) end up passing `--privileged` due to [a variety of issues with permissions in `proc`](https://stackoverflow.com/questions/56573990/access-to-full-proc-in-docker-container). This issue also doesn't appear outside of docker. There is a test already that does cover this (although it'll catch it for a permissions error, not a container capability error): `TestFetchProcessFromOtherUser`, however, that test is usually skipped in CI, as the CI environment lacks more than one user. The exact test condition itself can't really be tested without running in a container. If we want to test this more thoroughly, we'll need some kind of integration test in beats that runs in a container and specifically looks out for skipped processes. These kinds of tests are often flaky, so they tend not to happen. Once we get buildkite set up in the beats repo, we should add a test that specifically looks for skipped metrics, probably by spinning up multiple sub-processes and having beats monitor them directly. ## Why is it important? This changes the behavior of system metrics inside a container. ## Checklist - [x] My code follows the style guidelines of this project - [x] I have commented my code, particularly in hard-to-understand areas - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added an entry in `CHANGELOG.md`
elastic · Mar 6, 2024 · 709b5c4 · 709b5c4
1 parent 7f008f4
commit 709b5c4
Show file tree

Hide file tree

Showing 4 changed files with 107 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
  - Fix CmdLine generation and caching for system.process
  - Fix process metrics collection when both cgroup V1 and V2 controllers exist
+ - Skip permissions errors when reading /proc/pid/io
 
 ## [0.7.0]
 

diff --git a/metric/system/process/process.go b/metric/system/process/process.go
@@ -237,7 +237,10 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
 	// If we've passed the filter, continue to fill out the rest of the metrics
 	status, err = FillPidMetrics(procStats.Hostfs, pid, status, procStats.isWhitelistedEnvVar)
 	if err != nil {
-		return status, true, fmt.Errorf("FillPidMetrics: %w", err)
+		if !errors.Is(err, NonFatalErr{}) {
+			return status, true, fmt.Errorf("FillPidMetrics: %w", err)
+		}
+		procStats.logger.Debugf("Non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %s", pid, err)
 	}
 
 	if status.CPU.Total.Ticks.Exists() {

diff --git a/metric/system/process/process_linux_common.go b/metric/system/process/process_linux_common.go
@@ -124,9 +124,10 @@ func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter fu
 		return state, fmt.Errorf("error creating username for pid %d: %w", pid, err)
 	}
 
+	// the /proc/[pid]/io metrics require SYS_PTRACE when run from inside docker
 	state.IO, err = getIOData(hostfs, pid)
 	if err != nil {
-		return state, fmt.Errorf("error fetching IO metrics for pid %d: %w", pid, err)
+		return state, NonFatalErr{Err: fmt.Errorf("/io unavailable; if running inside a container, use SYS_PTRACE: %w", err)}
 	}
 
 	return state, nil

diff --git a/metric/system/process/process_linux_test.go b/metric/system/process/process_linux_test.go
@@ -20,24 +20,124 @@
 package process
 
 import (
+	"errors"
+	"fmt"
 	"os"
+	"os/exec"
 	"os/user"
 	"strconv"
+	"strings"
+	"syscall"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	"github.com/elastic/elastic-agent-libs/logp"
 	"github.com/elastic/elastic-agent-libs/opt"
 	"github.com/elastic/elastic-agent-system-metrics/metric/system/cgroup"
 	"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
 )
 
+// CreateUser creates a user on the machine.
+func CreateUser(name string, gid int) (int, error) {
+	args := []string{
+		"--gid", strconv.Itoa(gid),
+		"--system",
+		"--no-user-group",
+		"--shell", "/usr/bin/false",
+		name,
+	}
+	cmd := exec.Command("useradd", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		command := fmt.Sprintf("useradd %s", strings.Join(args, " "))
+		return -1, fmt.Errorf("%s failed: %w (output: %s)", command, err, output)
+	}
+	return FindUID(name)
+}
+
+// FindUID returns the user's UID on the machine.
+func FindUID(name string) (int, error) {
+	id, err := getentGetID("passwd", name)
+	if e := (&exec.ExitError{}); errors.As(err, &e) {
+		if e.ExitCode() == 2 {
+			// exit code 2 is the key doesn't exist in the database
+			return -1, fmt.Errorf("User not found")
+		}
+	}
+	return id, err
+}
+
+// helper to get a passwd entry for a user
+func getentGetID(database string, key string) (int, error) {
+	cmd := exec.Command("getent", database, key)
+	output, err := cmd.Output()
+	if err != nil {
+		return -1, fmt.Errorf("getent %s %s failed: %w (output: %s)", database, key, err, output)
+	}
+	split := strings.Split(string(output), ":")
+	if len(split) < 3 {
+		return -1, fmt.Errorf("unexpected format: %s", output)
+	}
+	val, err := strconv.Atoi(split[2])
+	if err != nil {
+		return -1, fmt.Errorf("failed to convert %s to int: %w", split[2], err)
+	}
+	return val, nil
+}
+
+func TestRunningProcessFromOtherUser(t *testing.T) {
+	// test for permission errors by creating a new user, then running a process as that user
+	testUsername := "test"
+	uid, err := CreateUser(testUsername, 0)
+	require.NoError(t, err)
+	t.Logf("uid: %v", uid)
+
+	t.Cleanup(func() {
+		// not sure how ephemeral the CI environment is, but delete the user anyway
+		cmd := exec.Command("userdel", "-f", testUsername)
+		output, err := cmd.CombinedOutput()
+		require.NoError(t, err, "got error deleting user: %s", string(output))
+	})
+
+	cmdHandler := exec.Command("sleep", "60")
+	cmdHandler.SysProcAttr = &syscall.SysProcAttr{Credential: &syscall.Credential{Uid: uint32(uid), Gid: 0}}
+
+	err = cmdHandler.Start()
+	require.NoError(t, err)
+	runPid := cmdHandler.Process.Pid
+
+	testStats := Stats{CPUTicks: true,
+		EnableCgroups: true,
+		EnableNetwork: true,
+		Hostfs:        resolve.NewTestResolver("/"),
+		Procs:         []string{".*"},
+		CgroupOpts:    cgroup.ReaderOptions{RootfsMountpoint: resolve.NewTestResolver("/")},
+	}
+	err = testStats.Init()
+	require.NoError(t, err)
+
+	uname, err := user.Current()
+	require.NoError(t, err)
+
+	result, err := testStats.GetOne(runPid)
+	require.NoError(t, err)
+	// check to make sure we still got valid results
+	require.Equal(t, "sleep 60", result["cmdline"])
+	require.NotEqual(t, uname.Name, result["username"])
+	require.NotZero(t, result["memory"].(map[string]interface{})["size"])
+	t.Logf("got result: %s", result["username"])
+
+}
+
 func TestFetchProcessFromOtherUser(t *testing.T) {
+
 	// If we just used Get() or FetchPids() to get a list of processes on the system, this would produce a bootstrapping problem
 	// where if the code wasn't working (and we were skipping over PIDs not owned by us) this test would pass.
 	// re-implement part of the core pid-fetch logic
 	// All this does is find a pid that's not owned by us.
+	_ = logp.DevelopmentSetup()
 	dir, err := os.Open("/proc/")
 	require.NoError(t, err, "error opening /proc")