Skip to content

Commit

Permalink
fix: only use psutil for memory reporting
Browse files Browse the repository at this point in the history
  • Loading branch information
fstagni committed Mar 18, 2024
1 parent ff07c21 commit ea2b445
Showing 1 changed file with 8 additions and 36 deletions.
44 changes: 8 additions & 36 deletions src/DIRAC/WorkloadManagementSystem/JobWrapper/Watchdog.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
########################################################################
# File : Watchdog.py
# Author: Stuart Paterson
########################################################################

""" The Watchdog class is used by the Job Wrapper to resolve and monitor
the system resource consumption. The Watchdog can determine if
a running job is stalled and indicate this to the Job Wrapper.
Expand All @@ -22,7 +17,6 @@
import math
import os
import re
import resource
import socket
import time
from pathlib import Path
Expand Down Expand Up @@ -287,12 +281,14 @@ def _performChecks(self):
self.parameters["LoadAverage"] = []
self.parameters["LoadAverage"].append(loadAvg)

memoryUsed = self.getMemoryUsed()
msg += f"MemUsed: {memoryUsed:.1f} kb "
heartBeatDict["MemoryUsed"] = memoryUsed
result = self.profiler.memoryUsage(withChildren=True)
if not result["OK"]:
self.log.warn("Could not get rss info from profiler", result["Message"])
msg += f"MemUsed: {result['Value']:.1f} kb "
heartBeatDict["MemoryUsed"] = result["Value"]
if "MemoryUsed" not in self.parameters:
self.parameters["MemoryUsed"] = []
self.parameters["MemoryUsed"].append(memoryUsed)
self.parameters["MemoryUsed"].append(result["Value"])

result = self.profiler.vSizeUsage(withChildren=True)
if not result["OK"]:
Expand All @@ -304,16 +300,6 @@ def _performChecks(self):
self.parameters["Vsize"].append(vsize)
msg += f"Job Vsize: {vsize:.1f} kb "

result = self.profiler.memoryUsage(withChildren=True)
if not result["OK"]:
self.log.warn("Could not get rss info from profiler", result["Message"])
else:
rss = result["Value"] * 1024.0
heartBeatDict["RSS"] = rss
self.parameters.setdefault("RSS", [])
self.parameters["RSS"].append(rss)
msg += f"Job RSS: {rss:.1f} kb "

if "DiskSpace" not in self.parameters:
self.parameters["DiskSpace"] = []

Expand Down Expand Up @@ -744,11 +730,6 @@ def calibrate(self):
self.initialValues["LoadAverage"] = float(os.getloadavg()[0])
self.parameters["LoadAverage"] = []

memUsed = self.getMemoryUsed()

self.initialValues["MemoryUsed"] = memUsed
self.parameters["MemoryUsed"] = []

result = self.profiler.vSizeUsage(withChildren=True)
if not result["OK"]:
self.log.warn("Could not get vSize info from profiler", result["Message"])
Expand All @@ -762,9 +743,8 @@ def calibrate(self):
if not result["OK"]:
self.log.warn("Could not get rss info from profiler", result["Message"])
else:
rss = result["Value"] * 1024.0
self.initialValues["RSS"] = rss
self.log.verbose("RSS(kb)", f"{rss:.1f}")
self.initialValues["RSS"] = result["Value"]
self.log.verbose("RSS(mb)", f"{result['Value']:.1f}")
self.parameters["RSS"] = []

# We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
Expand Down Expand Up @@ -968,14 +948,6 @@ def getNodeInformation(self):

return result

#############################################################################
def getMemoryUsed(self):
    """Return the combined ``ru_maxrss`` of this process and its children.

    The value is the sum of the ``ru_maxrss`` field reported by
    ``resource.getrusage`` for ``RUSAGE_SELF`` and for ``RUSAGE_CHILDREN``,
    converted to ``float``.

    NOTE(review): the unit of ``ru_maxrss`` is decided by the operating
    system; callers appear to report the value as kb -- confirm for the
    platforms in use.
    """
    # Query the calling process first, then its children.
    ownUsage = resource.getrusage(resource.RUSAGE_SELF)
    childrenUsage = resource.getrusage(resource.RUSAGE_CHILDREN)
    return float(ownUsage.ru_maxrss + childrenUsage.ru_maxrss)

#############################################################################
def getDiskSpace(self, exclude=None):
"""Obtains the available disk space."""
Expand Down

0 comments on commit ea2b445

Please sign in to comment.