Skip to content

Commit

Permalink
Ensure old processes are cleaned up
Browse files: browse the repository at this point in the history
  • Loading branch information
gjulianm committed Oct 7, 2024
1 parent bbc7fbf commit aef98c6
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 2 deletions.
1 change: 1 addition & 0 deletions pkg/gpu/consumer.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ func (c *cudaEventConsumer) Start() {
case <-health.C:
case <-processSync.C:
c.checkClosedProcesses()
c.sysCtx.cleanupOldEntries()
case batchData, ok := <-dataChannel:
if !ok {
return
Expand Down
27 changes: 25 additions & 2 deletions pkg/gpu/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package gpu
import (
"debug/elf"
"fmt"
"time"

"github.com/DataDog/datadog-agent/pkg/gpu/cuda"
"github.com/DataDog/datadog-agent/pkg/util/kernel"
Expand All @@ -27,8 +28,13 @@ type systemContext struct {

// fileData holds the symbol table and Fatbin data for a given file, together
// with the time the entry was last used.
type fileData struct {
	// symbolTable maps an ELF symbol's value to that symbol's name.
	symbolTable map[uint64]string
	// fatbin is the Fatbin data associated with the file.
	fatbin *cuda.Fatbin
	// lastAccessed is the time of the most recent updateAccessTime call; it is
	// compared against an expiration time to decide whether the entry is stale.
	lastAccessed time.Time
}

// updateAccessTime records the current time as the moment this entry was last
// accessed.
func (fd *fileData) updateAccessTime() {
	now := time.Now()
	fd.lastAccessed = now
}

func getSystemContext() (*systemContext, error) {
Expand Down Expand Up @@ -63,6 +69,7 @@ func (ctx *systemContext) queryDevices() error {

func (ctx *systemContext) getFileData(path string) (*fileData, error) {
if fd, ok := ctx.fileData[path]; ok {
fd.updateAccessTime()
return fd, nil
}

Expand Down Expand Up @@ -90,6 +97,7 @@ func (ctx *systemContext) getFileData(path string) (*fileData, error) {
fd.symbolTable[sym.Value] = sym.Name
}

fd.updateAccessTime()
ctx.fileData[path] = fd
return ctx.fileData[path], nil
}
Expand All @@ -107,3 +115,18 @@ func (ctx *systemContext) getProcessMemoryMaps(pid int) (*kernel.ProcMapEntries,
ctx.pidMaps[pid] = &maps
return &maps, nil
}

// cleanupDataForProcess drops the pidMaps entry stored for the given pid.
// Calling it for a pid that has no entry is a no-op.
func (ctx *systemContext) cleanupDataForProcess(pid int) {
	delete(ctx.pidMaps, pid)
}

// cleanupOldEntries removes every fileData entry whose lastAccessed time is
// more than maxFatbinAge (five minutes) in the past.
func (ctx *systemContext) cleanupOldEntries() {
	const maxFatbinAge = 5 * time.Minute

	// Compute the cutoff once so every entry is judged against the same instant.
	cutoff := time.Now().Add(-maxFatbinAge)

	for key, entry := range ctx.fileData {
		if !entry.lastAccessed.Before(cutoff) {
			// Accessed at or after the cutoff: keep it.
			continue
		}
		// Removing map entries while ranging over the map is permitted in Go.
		delete(ctx.fileData, key)
	}
}
2 changes: 2 additions & 0 deletions pkg/gpu/stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,5 +264,7 @@ func (sh *StreamHandler) markEnd() error {
sh.allocations = append(sh.allocations, &data)
}

sh.sysCtx.cleanupDataForProcess(int(sh.key.Pid))

return nil
}

0 comments on commit aef98c6

Please sign in to comment.