From 4daddf3615576b128d8c117af866caed95a5b52b Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Thu, 21 Dec 2023 07:38:47 +0100 Subject: [PATCH] metric/system/cgroup: introduce cache for cgroup v2 (#117) ## What does this PR do? Reintroduce the improvements from https://github.com/elastic/elastic-agent-system-metrics/pull/103. This PR got reverted with https://github.com/elastic/elastic-agent-system-metrics/pull/113 because of https://github.com/elastic/elastic-agent-system-metrics/issues/109. The issue, that got reported in https://github.com/elastic/elastic-agent-system-metrics/issues/109, was fixed with https://github.com/elastic/elastic-agent-system-metrics/pull/116. So its time to bring back the performance improvements by reintroducing the cache for cgroup v2. ## Why is it important? ## Checklist - [x] My code follows the style guidelines of this project - [x] I have commented my code, particularly in hard-to-understand areas - [ ] ~~I have added tests that prove my fix is effective or that my feature works~~ - [ ] ~~I have added an entry in `CHANGELOG.md`~~ --------- Signed-off-by: Florian Lehner --- metric/system/cgroup/cgstats.go | 4 ++-- metric/system/cgroup/doc.go | 2 +- metric/system/cgroup/reader.go | 17 +++++++++++++++ metric/system/cgroup/util.go | 29 ++++++++++++++++++++++++-- metric/system/filesystem/filesystem.go | 2 +- metric/system/host/host_test.go | 8 ++++--- metric/system/hwmon/hwmon.go | 4 ++-- metric/system/network/helpers_test.go | 3 ++- 8 files changed, 57 insertions(+), 12 deletions(-) diff --git a/metric/system/cgroup/cgstats.go b/metric/system/cgroup/cgstats.go index 4476b80bc1..a1e83c5ee3 100644 --- a/metric/system/cgroup/cgstats.go +++ b/metric/system/cgroup/cgstats.go @@ -40,7 +40,7 @@ func (stat StatsV1) CGVersion() CgroupsVersion { return CgroupsV1 } -//Format converts the stats object to a MapStr that can be sent to Report() +// Format converts the stats object to a MapStr that can be sent to Report() func (stat StatsV1) Format() (mapstr.M, error) { to := mapstr.M{} err := typeconv.Convert(&to, stat) @@ -95,7 +95,7 @@ func (stat *StatsV1) FillPercentages(prev CGStats, curTime, prevTime time.Time) stat.CPUAccounting.Stats.System.Norm.Pct = opt.FloatWith(metric.Round(normalizedSystem)) } -//Format converts the stats object to a MapStr that can be sent to Report() +// Format converts the stats object to a MapStr that can be sent to Report() func (stat StatsV2) Format() (mapstr.M, error) { to := mapstr.M{} err := typeconv.Convert(&to, stat) diff --git a/metric/system/cgroup/doc.go b/metric/system/cgroup/doc.go index c64f6073a9..983d8461d8 100644 --- a/metric/system/cgroup/doc.go +++ b/metric/system/cgroup/doc.go @@ -19,7 +19,7 @@ // control groups, a Linux kernel feature for grouping tasks to track and limit // resource usage. // -// Terminology +// # Terminology // // A cgroup is a collection of processes that are bound to a set of limits. // diff --git a/metric/system/cgroup/reader.go b/metric/system/cgroup/reader.go index 1debc009b9..33e84f4ace 100644 --- a/metric/system/cgroup/reader.go +++ b/metric/system/cgroup/reader.go @@ -23,6 +23,8 @@ import ( "path/filepath" "strconv" "strings" + "sync" + "time" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-system-metrics/metric/system/cgroup/cgv1" @@ -77,6 +79,17 @@ type mount struct { fullPath string // Absolute path to the cgroup. It's the mountpoint joined with the path. } +// pathListWithTime combines PathList with a timestamp. +type pathListWithTime struct { + added time.Time + pathList PathList +} + +type pathCache struct { + sync.RWMutex + cache map[string]pathListWithTime +} + // Reader reads cgroup metrics and limits. type Reader struct { // Mountpoint of the root filesystem. Defaults to / if not set. This can be @@ -85,6 +98,9 @@ type Reader struct { ignoreRootCgroups bool // Ignore a cgroup when its path is "/". cgroupsHierarchyOverride string cgroupMountpoints Mountpoints // Mountpoints for each subsystem (e.g. cpu, cpuacct, memory, blkio). + + // Cache to map known v2 cgroup controllerPaths to pathListWithTime. + v2ControllerPathCache pathCache } // ReaderOptions holds options for NewReaderOptions. @@ -135,6 +151,7 @@ func NewReaderOptions(opts ReaderOptions) (*Reader, error) { ignoreRootCgroups: opts.IgnoreRootCgroups, cgroupsHierarchyOverride: opts.CgroupsHierarchyOverride, cgroupMountpoints: mountpoints, + v2ControllerPathCache: pathCache{cache: make(map[string]pathListWithTime)}, }, nil } diff --git a/metric/system/cgroup/util.go b/metric/system/cgroup/util.go index 3677ade485..44f5356f34 100644 --- a/metric/system/cgroup/util.go +++ b/metric/system/cgroup/util.go @@ -25,6 +25,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" @@ -67,7 +68,7 @@ type PathList struct { // Flatten combines the V1 and V2 cgroups in cases where we don't need a map with keys func (pl PathList) Flatten() []ControllerPath { - list := []ControllerPath{} + list := make([]ControllerPath, 0, len(pl.V1)+len(pl.V2)) for _, v1 := range pl.V1 { list = append(list, v1) } @@ -229,7 +230,7 @@ func SubsystemMountpoints(rootfs resolve.Resolver, subsystems map[string]struct{ // ProcessCgroupPaths returns the cgroups to which a process belongs and the // pathname of the cgroup relative to the mountpoint of the subsystem. -func (r Reader) ProcessCgroupPaths(pid int) (PathList, error) { +func (r *Reader) ProcessCgroupPaths(pid int) (PathList, error) { cgroupPath := filepath.Join("proc", strconv.Itoa(pid), "cgroup") cgroup, err := os.Open(r.rootfsMountpoint.ResolveHostFS(cgroupPath)) if err != nil { @@ -288,6 +289,24 @@ the container as /sys/fs/cgroup/unified and start the system module with the hos controllerPath = r.rootfsMountpoint.ResolveHostFS(filepath.Join("/sys/fs/cgroup/unified", path)) } + // Check if there is an entry for controllerPath already cached. + r.v2ControllerPathCache.Lock() + cacheEntry, ok := r.v2ControllerPathCache.cache[controllerPath] + if ok { + // If the cached entry for controllerPath is not older than 5 minutes, + // return the cached entry. + if time.Since(cacheEntry.added) < 5*time.Minute { + cPaths.V2 = cacheEntry.pathList.V2 + r.v2ControllerPathCache.Unlock() + continue + } + + // Consider the existing entry for controllerPath invalid, as it is + // older than 5 minutes. + delete(r.v2ControllerPathCache.cache, controllerPath) + } + r.v2ControllerPathCache.Unlock() + cgpaths, err := os.ReadDir(controllerPath) if err != nil { return cPaths, fmt.Errorf("error fetching cgroupV2 controllers for cgroup location '%s' and path line '%s': %w", r.cgroupMountpoints.V2Loc, line, err) @@ -300,6 +319,12 @@ the container as /sys/fs/cgroup/unified and start the system module with the hos cPaths.V2[controllerName] = ControllerPath{ControllerPath: path, FullPath: controllerPath, IsV2: true} } } + r.v2ControllerPathCache.Lock() + r.v2ControllerPathCache.cache[controllerPath] = pathListWithTime{ + added: time.Now(), + pathList: cPaths, + } + r.v2ControllerPathCache.Unlock() // cgroup v1 } else { subsystems := strings.Split(fields[1], ",") diff --git a/metric/system/filesystem/filesystem.go b/metric/system/filesystem/filesystem.go index 6010931314..47c54920a4 100644 --- a/metric/system/filesystem/filesystem.go +++ b/metric/system/filesystem/filesystem.go @@ -34,7 +34,7 @@ import ( "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) -//FSStat carries the metadata for a given filesystem +// FSStat carries the metadata for a given filesystem type FSStat struct { Directory string `struct:"mount_point,omitempty"` Device string `struct:"device_name,omitempty"` diff --git a/metric/system/host/host_test.go b/metric/system/host/host_test.go index 36e963eaea..03a56e0f49 100644 --- a/metric/system/host/host_test.go +++ b/metric/system/host/host_test.go @@ -18,11 +18,13 @@ package host import ( - "github.com/elastic/elastic-agent-libs/mapstr" - "github.com/elastic/go-sysinfo/types" - "github.com/stretchr/testify/require" "testing" "time" + + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-libs/mapstr" + "github.com/elastic/go-sysinfo/types" ) func TestMapHostInfo(t *testing.T) { diff --git a/metric/system/hwmon/hwmon.go b/metric/system/hwmon/hwmon.go index f96cab5c7e..08eed71fdd 100644 --- a/metric/system/hwmon/hwmon.go +++ b/metric/system/hwmon/hwmon.go @@ -33,7 +33,7 @@ import ( "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) -//ReportSensors returns the metrics from all the known sensors. +// ReportSensors returns the metrics from all the known sensors. func ReportSensors(dev Device) (MonData, error) { metrics := MonData{} for _, sensor := range dev.Sensors { @@ -51,7 +51,7 @@ func ReportSensors(dev Device) (MonData, error) { return metrics, nil } -//DetectHwmon returns a list of hwmon sensors found on the system, if they exist +// DetectHwmon returns a list of hwmon sensors found on the system, if they exist func DetectHwmon(hostfs resolve.Resolver) ([]Device, error) { sensorTypeRegex := regexp.MustCompile("(^[a-z]*)([0-9]*)") fullPath := hostfs.ResolveHostFS(baseDir) diff --git a/metric/system/network/helpers_test.go b/metric/system/network/helpers_test.go index a1b918b6ea..e76d97c877 100644 --- a/metric/system/network/helpers_test.go +++ b/metric/system/network/helpers_test.go @@ -20,8 +20,9 @@ package network import ( "testing" - "github.com/elastic/go-sysinfo/types" "github.com/stretchr/testify/require" + + "github.com/elastic/go-sysinfo/types" ) func TestFilter(t *testing.T) {