Skip to content

Commit

Permalink
Fix metrics collection for privileged process on Windows (#104)
Browse files Browse the repository at this point in the history
## What does this PR do?

It improves metric collection on Windows hosts so we can collect some
metrics from privileged process like Elastic Endpoint or Elastic-Agent.
In order to achieve that metrics collection for Windows are grouped by
the access level required to collect metrics. First the metrics that can
be collected with `PROCESS_QUERY_LIMITED_INFORMATION` are collected,
then the others are collected. In case the second batch fails, we still
report the partial metrics.

## Why is it important?

It allows us to collect CPU and memory metrics from Endpoint and
Elastic-Agent, which improves our monitoring dashboards.

## Related issues

- Closes elastic/beats#17314
  • Loading branch information
belimawr authored Nov 3, 2023
1 parent 3bc3ccc commit e13673b
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 24 deletions.
49 changes: 40 additions & 9 deletions metric/system/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package process

import (
"context"
"errors"
"fmt"
"os"
"sort"
Expand Down Expand Up @@ -168,8 +169,11 @@ func (procStats *Stats) GetSelf() (ProcState, error) {
func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) (ProcsMap, []ProcState) {
status, saved, err := procStats.pidFill(pid, true)
if err != nil {
procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err)
return procMap, proclist
if !errors.Is(err, NonFatalErr{}) {
procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err)
return procMap, proclist
}
procStats.logger.Debugf("Non fatal error fetching PID some info for %d, metrics are valid, but partial: %s", pid, err)
}
if !saved {
procStats.logger.Debugf("Process name does not match the provided regex; PID=%d; name=%s", pid, status.Name)
Expand All @@ -181,6 +185,28 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState)
return procMap, proclist
}

// NonFatalErr is returned when there was an error
// collecting metrics, however the metrics already
// gathered and returned are still valid.
// This error can be safely ignored, this will result
// in having partial metrics for a process rather than
// no metrics at all.
//
// It was introduced to allow for partial metrics collection
// on privileged process on Windows.
type NonFatalErr struct {
Err error
}

func (c NonFatalErr) Error() string {
return "Not enough privileges to fetch information: " + c.Err.Error()
}

func (c NonFatalErr) Is(other error) bool {
_, is := other.(NonFatalErr)
return is
}

// pidFill is an entrypoint used by OS-specific code to fill out a pid.
// This in turn calls various OS-specific code to fill out the various bits of PID data
// This is done to minimize the code duplication between different OS implementations
Expand Down Expand Up @@ -213,10 +239,17 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
if len(status.Args) > 0 && status.Cmdline == "" {
status.Cmdline = strings.Join(status.Args, " ")
}
if status.CPU.Total.Ticks.Exists() {
status.CPU.Total.Value = opt.FloatWith(metric.Round(float64(status.CPU.Total.Ticks.ValueOr(0))))
}

// postprocess with cgroups and percentages
last, ok := procStats.ProcsMap.GetPid(status.Pid.ValueOr(0))
status.SampleTime = time.Now()
if ok {
status = GetProcCPUPercentage(last, status)
}

if procStats.EnableCgroups {
cgStats, err := procStats.cgroups.GetStatsForPid(status.Pid.ValueOr(0))
if err != nil {
Expand All @@ -228,6 +261,11 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
}
} // end cgroups processor

status, err = FillMetricsRequiringMoreAccess(pid, status)
if err != nil {
return status, true, fmt.Errorf("FillMetricsRequiringMoreAccess: %w", err)
}

// network data
if procStats.EnableNetwork {
procHandle, err := sysinfo.Process(pid)
Expand All @@ -245,13 +283,6 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
}
}

if status.CPU.Total.Ticks.Exists() {
status.CPU.Total.Value = opt.FloatWith(metric.Round(float64(status.CPU.Total.Ticks.ValueOr(0))))
}
if ok {
status = GetProcCPUPercentage(last, status)
}

return status, true, nil
}

Expand Down
4 changes: 4 additions & 0 deletions metric/system/process/process_aix.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,3 +195,7 @@ func FillPidMetrics(_ resolve.Resolver, pid int, state ProcState, filter func(st

return state, nil
}

func FillMetricsRequiringMoreAccess(_ int, state ProcState) (ProcState, error) {
return state, nil
}
4 changes: 4 additions & 0 deletions metric/system/process/process_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,7 @@ func sysctl(mib []C.int, old *byte, oldlen *uintptr,
}
return err
}

func FillMetricsRequiringMoreAccess(_ int, state ProcState) (ProcState, error) {
return state, nil
}
4 changes: 4 additions & 0 deletions metric/system/process/process_linux_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,3 +460,7 @@ func getProcState(b byte) PidState {
}
return Unknown
}

func FillMetricsRequiringMoreAccess(_ int, state ProcState) (ProcState, error) {
return state, nil
}
6 changes: 5 additions & 1 deletion metric/system/process/process_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,11 @@ func runThreads(t *testing.T) *exec.Cmd {
var log string
require.Eventually(t,
func() bool {
log += b.String()
if cmd.ProcessState != nil {
t.Fatalf("Process exited with error: '%s'", cmd.ProcessState.String())
}
line := b.String()
log += line
return strings.Contains(log, "running")
},
time.Second, 50*time.Millisecond,
Expand Down
32 changes: 22 additions & 10 deletions metric/system/process/process_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,6 @@ func GetInfoForPid(_ resolve.Resolver, pid int) (ProcState, error) {
state.State = status
}

if numThreads, err := FetchNumThreads(pid); err != nil {
errs = append(errs, fmt.Errorf("error fetching num threads: %w", err))
} else {
state.NumThreads = opt.IntWith(numThreads)
}

if err := errors.Join(errs...); err != nil {
return state, fmt.Errorf("could not get all information for PID %d: %w",
pid, err)
Expand Down Expand Up @@ -140,11 +134,25 @@ func FillPidMetrics(_ resolve.Resolver, pid int, state ProcState, _ func(string)

state.CPU.StartTime = unixTimeMsToTime(startTime)

return state, nil
}

// FillMetricsRequiringMoreAccess
// All calls that need more access rights than
// windows.PROCESS_QUERY_LIMITED_INFORMATION
func FillMetricsRequiringMoreAccess(pid int, state ProcState) (ProcState, error) {
argList, err := getProcArgs(pid)
if err != nil {
return state, fmt.Errorf("error fetching process args: %w", err)
return state, fmt.Errorf("error fetching process args: %w", NonFatalErr{Err: err})
}
state.Args = argList

if numThreads, err := FetchNumThreads(pid); err != nil {
return state, fmt.Errorf("error fetching num threads: %w", NonFatalErr{Err: err})
} else {
state.NumThreads = opt.IntWith(numThreads)
}

return state, nil
}

Expand Down Expand Up @@ -199,10 +207,14 @@ func getProcTimes(pid int) (uint64, uint64, uint64, error) {
return uint64(windows.FiletimeToDuration(&cpu.UserTime).Nanoseconds() / 1e6), uint64(windows.FiletimeToDuration(&cpu.KernelTime).Nanoseconds() / 1e6), uint64(cpu.CreationTime.Nanoseconds() / 1e6), nil
}

// procMem gets the memory usage for the given PID.
// The current implementation calls
// GetProcessMemoryInfo (https://learn.microsoft.com/en-us/windows/win32/api/psapi/nf-psapi-getprocessmemoryinfo)
// We only need `PROCESS_QUERY_LIMITED_INFORMATION` because we do not support
// Windows Server 2003 or Windows XP
func procMem(pid int) (uint64, uint64, error) {
handle, err := syscall.OpenProcess(
windows.PROCESS_QUERY_LIMITED_INFORMATION|
windows.PROCESS_VM_READ,
windows.PROCESS_QUERY_LIMITED_INFORMATION,
false,
uint32(pid))
if err != nil {
Expand Down Expand Up @@ -278,7 +290,7 @@ func getParentPid(pid int) (int, error) {
}

func getProcCredName(pid int) (string, error) {
handle, err := syscall.OpenProcess(syscall.PROCESS_QUERY_INFORMATION, false, uint32(pid))
handle, err := syscall.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
if err != nil {
return "", fmt.Errorf("OpenProcess failed for pid=%v: %w", pid, err)
}
Expand Down
13 changes: 9 additions & 4 deletions metric/system/process/process_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,24 @@ func TestGetInfoForPid_numThreads(t *testing.T) {
expected := 45

cmd := runThreads(t)
got, err := GetInfoForPid(
state, err := GetInfoForPid(
resolve.NewTestResolver("/"), cmd.Process.Pid)
require.NoError(t, err, "failed to GetInfoForPid")

if !got.NumThreads.Exists() {
bs, err := json.Marshal(got)
state, err = FillMetricsRequiringMoreAccess(cmd.Process.Pid, state)
if err != nil {
t.Fatalf("error calling FillMetricsRequiringMoreAccess: %s", err)
}

if !state.NumThreads.Exists() {
bs, err := json.Marshal(state)
if err != nil {
t.Logf("could not marshal ProcState: %v", err)
}
t.Fatalf("num_thread was not collected. Collected info: %s", bs)
}

numThreads := got.NumThreads.ValueOr(-1)
numThreads := state.NumThreads.ValueOr(-1)
if expected != numThreads {
// it might be an older Windows version or, by the time we got the num_threads,
// the program was indeed using only what it spawned
Expand Down

0 comments on commit e13673b

Please sign in to comment.