From aef98c66f0803cf0d2e3c36d480f8ea43f5c0ce3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillermo=20Julia=CC=81n?=
Date: Mon, 30 Sep 2024 16:12:46 +0200
Subject: [PATCH] Ensure old processes are cleaned up

---
 pkg/gpu/consumer.go |  1 +
 pkg/gpu/context.go  | 27 +++++++++++++++++++++++++--
 pkg/gpu/stream.go   |  2 ++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/pkg/gpu/consumer.go b/pkg/gpu/consumer.go
index 775994250dcbe..26dc4e8191b0d 100644
--- a/pkg/gpu/consumer.go
+++ b/pkg/gpu/consumer.go
@@ -92,6 +92,7 @@ func (c *cudaEventConsumer) Start() {
 			case <-health.C:
 			case <-processSync.C:
 				c.checkClosedProcesses()
+				c.sysCtx.cleanupOldEntries()
 			case batchData, ok := <-dataChannel:
 				if !ok {
 					return
diff --git a/pkg/gpu/context.go b/pkg/gpu/context.go
index 91a0b0f1a0a6a..f931048bfa8bc 100644
--- a/pkg/gpu/context.go
+++ b/pkg/gpu/context.go
@@ -8,6 +8,7 @@ package gpu
 import (
 	"debug/elf"
 	"fmt"
+	"time"
 
 	"github.com/DataDog/datadog-agent/pkg/gpu/cuda"
 	"github.com/DataDog/datadog-agent/pkg/util/kernel"
@@ -27,8 +28,13 @@ type systemContext struct {
 
 // fileData holds the symbol table and Fatbin data for a given file.
 type fileData struct {
-	symbolTable map[uint64]string
-	fatbin      *cuda.Fatbin
+	symbolTable  map[uint64]string
+	fatbin       *cuda.Fatbin
+	lastAccessed time.Time
+}
+
+func (fd *fileData) updateAccessTime() {
+	fd.lastAccessed = time.Now()
 }
 
 func getSystemContext() (*systemContext, error) {
@@ -63,6 +69,7 @@ func (ctx *systemContext) queryDevices() error {
 
 func (ctx *systemContext) getFileData(path string) (*fileData, error) {
 	if fd, ok := ctx.fileData[path]; ok {
+		fd.updateAccessTime()
 		return fd, nil
 	}
 
@@ -90,6 +97,7 @@ func (ctx *systemContext) getFileData(path string) (*fileData, error) {
 		fd.symbolTable[sym.Value] = sym.Name
 	}
 
+	fd.updateAccessTime()
 	ctx.fileData[path] = fd
 	return ctx.fileData[path], nil
 }
@@ -107,3 +115,18 @@ func (ctx *systemContext) getProcessMemoryMaps(pid int) (*kernel.ProcMapEntries,
 	ctx.pidMaps[pid] = &maps
 	return &maps, nil
 }
+
+func (ctx *systemContext) cleanupDataForProcess(pid int) {
+	delete(ctx.pidMaps, pid)
+}
+
+func (ctx *systemContext) cleanupOldEntries() {
+	maxFatbinAge := 5 * time.Minute
+	fatbinExpirationTime := time.Now().Add(-maxFatbinAge)
+
+	for path, fd := range ctx.fileData {
+		if fd.lastAccessed.Before(fatbinExpirationTime) {
+			delete(ctx.fileData, path)
+		}
+	}
+}
diff --git a/pkg/gpu/stream.go b/pkg/gpu/stream.go
index 6853357646603..a4e47322ea3a8 100644
--- a/pkg/gpu/stream.go
+++ b/pkg/gpu/stream.go
@@ -264,5 +264,7 @@ func (sh *StreamHandler) markEnd() error {
 		sh.allocations = append(sh.allocations, &data)
 	}
 
+	sh.sysCtx.cleanupDataForProcess(int(sh.key.Pid))
+
 	return nil
 }