Skip to content

Commit

Permalink
[chore][process] narrow the log that is passed to the caller (#195)
Browse files Browse the repository at this point in the history
This is to reduce the log pollution in default case. If the user is
interested to view the full log, they can enable "debug" logging.

Closes elastic/beats#41890
  • Loading branch information
VihasMakwana authored Dec 12, 2024
1 parent 76d05d1 commit fa3caa1
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 8 deletions.
4 changes: 2 additions & 2 deletions metric/system/process/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ type NonFatalErr struct {

func (c NonFatalErr) Error() string {
if c.Err != nil {
return "Not enough privileges to fetch information: " + c.Err.Error()
return "non fatal error; reporting partial metrics: " + c.Err.Error()
}
return "Not enough privileges to fetch information"
return "non fatal error"
}

func (c NonFatalErr) Is(other error) bool {
Expand Down
35 changes: 29 additions & 6 deletions metric/system/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
sysinfotypes "github.com/elastic/go-sysinfo/types"
)

var errFetchingPIDs = "error fetching PID metrics for %d processes, most likely a \"permission denied\" error. Enable debug logging to determine the exact cause."

// ListStates is a wrapper that returns a list of processess with only the basic PID info filled out.
func ListStates(hostfs resolve.Resolver) ([]ProcState, error) {
init := Stats{
Expand All @@ -54,11 +56,15 @@ func ListStates(hostfs resolve.Resolver) ([]ProcState, error) {
}

// actually fetch the PIDs from the OS-specific code
_, plist, err := init.FetchPids()
pidMap, plist, err := init.FetchPids()
if err != nil && !isNonFatal(err) {
return nil, fmt.Errorf("error gathering PIDs: %w", err)
}

failedPIDs := extractFailedPIDs(pidMap)
if err != nil && len(failedPIDs) > 0 {
init.logger.Debugf("error fetching process metrics: %v", err)
return plist, NonFatalErr{Err: fmt.Errorf(errFetchingPIDs, len(failedPIDs))}
}
return plist, toNonFatal(err)
}

Expand Down Expand Up @@ -96,6 +102,7 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) {
if wrappedErr != nil && !isNonFatal(wrappedErr) {
return nil, nil, fmt.Errorf("error gathering PIDs: %w", wrappedErr)
}
failedPIDs := extractFailedPIDs(pidMap)
// We use this to track processes over time.
procStats.ProcsMap.SetMap(pidMap)

Expand Down Expand Up @@ -133,8 +140,11 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) {
procs = append(procs, proc)
rootEvents = append(rootEvents, rootMap)
}

return procs, rootEvents, toNonFatal(wrappedErr)
if len(failedPIDs) > 0 {
procStats.logger.Debugf("error fetching process metrics: %v", wrappedErr)
return procs, rootEvents, NonFatalErr{Err: fmt.Errorf(errFetchingPIDs, len(failedPIDs))}
}
return procs, rootEvents, nil
}

// GetOne fetches process data for a given PID if its name matches the regexes provided from the host.
Expand Down Expand Up @@ -197,6 +207,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState)
status, saved, err := procStats.pidFill(pid, true)
var nonFatalErr error
if err != nil {
procMap[pid] = ProcState{Failed: true}
if !errors.Is(err, NonFatalErr{}) {
procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err)
// While monitoring a set of processes, some processes might get killed after we get all the PIDs
Expand All @@ -206,7 +217,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState)
}
return procMap, proclist, err
}
nonFatalErr = fmt.Errorf("non fatal error fetching PID some info for %d, metrics are valid, but partial: %w", pid, err)
nonFatalErr = fmt.Errorf("error for pid %d: %w", pid, err)
procStats.logger.Debugf(err.Error())
}
if !saved {
Expand Down Expand Up @@ -252,7 +263,7 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
if !errors.Is(err, NonFatalErr{}) {
return status, true, fmt.Errorf("FillPidMetrics failed for PID %d: %w", pid, err)
}
wrappedErr = errors.Join(wrappedErr, fmt.Errorf("non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %w", pid, err))
wrappedErr = errors.Join(wrappedErr, err)
procStats.logger.Debugf(wrappedErr.Error())
}

Expand Down Expand Up @@ -409,3 +420,15 @@ func (procStats *Stats) isWhitelistedEnvVar(varName string) bool {
}
return false
}

func extractFailedPIDs(procMap ProcsMap) []int {
list := make([]int, 0)
for pid, state := range procMap {
if state.Failed {
list = append(list, pid)
// delete the failed state so we don't return the state to caller
delete(procMap, pid)
}
}
return list
}
3 changes: 3 additions & 0 deletions metric/system/process/process_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ type ProcState struct {

// meta
SampleTime time.Time `struct:"-,omitempty"`

// boolean to indicate that given PID has failed due to some error.
Failed bool `struct:"-,omitempty"`
}

// ProcCPUInfo is the main struct for CPU metrics
Expand Down

0 comments on commit fa3caa1

Please sign in to comment.