From ad1dc68bdb2e23fb32f0bd0f4cf7e829e77cfe05 Mon Sep 17 00:00:00 2001 From: execshred Date: Tue, 9 Apr 2019 10:11:06 -0700 Subject: [PATCH 1/5] Adding info on the CPU sets. --- collectors/0/procstats.py | 158 +++++++++++++++++++++++++++++--------- 1 file changed, 120 insertions(+), 38 deletions(-) diff --git a/collectors/0/procstats.py b/collectors/0/procstats.py index 3f602698..48f1e709 100755 --- a/collectors/0/procstats.py +++ b/collectors/0/procstats.py @@ -22,6 +22,13 @@ from collectors.lib import utils +INTERRUPTS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds +SOFTIRQS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds +#RHEL 7 +CPUSET_PATH = "/sys/fs/cgroup/cpuset" +if os.path.isdir("/dev/cpuset"): + #RHEL 6 + CPUSET_PATH = "/dev/cpuset" COLLECTION_INTERVAL = 15 # seconds NUMADIR = "/sys/devices/system/node" @@ -79,6 +86,58 @@ def print_numa_stats(numafiles): % (ts, stats["interleave_hit"], node_id)) numafile.close() +def expand_numlist(s): + """return a list of numbers from a list with ranges, + e.g. 4,5-10,14-16""" + r = [] + for i in s.split(','): + if '-' not in i: + r.append(int(i)) + else: + l,h = map(int, i.split('-')) + r+= range(l,h+1) + return r + +def cpus_csets(cpuset_path): + """Return a hash of cpu_id_as_string->cset_name""" + try: + csets = os.listdir(cpuset_path) + except OSError (errno, msg): + if errno == 2: # No such file or directory + return {} # We don't have csets + raise + + csets = [cset for cset in csets if os.path.isdir(os.path.join(cpuset_path, cset))] + + cpu2cset = {} + for cset in csets: + cpuspath = os.path.join(cpuset_path, cset, 'cpuset.cpus') + if not os.path.isfile(cpuspath): + cpuspath = os.path.join(cpuset_path, cset, 'cpus') + if not os.path.isfile(cpuspath): + # No such file?? Ignore csets + sys.stderr.write("No 'cpuset.cpus' or 'cpus' file in %s!" % os.path.join(cpuset_path, cset)) + continue + + try: + f_cpus = open(cpuspath) + except: + # Ignore that one and continue + sys.stderr.write("Could not open %s!" % cpuspath) + continue + + format_errors = 0 + for line in f_cpus: + m = re.match('^[-0-9,]+$', line) + if m: + for c in expand_numlist(line): + cpu2cset[str(c)] = cset + else: + format_errors += 1 + if format_errors > 0: + sys.stderr.write("%d line(s) of %s were not in the expected format" % (format_errors, cpuspath)) + + return cpu2cset def main(): """procstats main loop""" @@ -109,7 +168,9 @@ def main(): numastats = find_sysfs_numa_stats() utils.drop_privileges() + iteration = -1 while True: + iteration += 1 # proc.uptime f_uptime.seek(0) ts = int(time.time()) @@ -148,6 +209,7 @@ def main(): # proc.stat f_stat.seek(0) ts = int(time.time()) + cpu2cset = cpus_csets(CPUSET_PATH) for line in f_stat: m = re.match("(\w+)\s+(.*)", line) if not m: @@ -156,7 +218,11 @@ def main(): cpu_m = re.match("cpu(\d+)", m.group(1)) if cpu_m: metric_percpu = '.percpu' - tags = ' cpu=%s' % cpu_m.group(1) + cpu_i = cpu_m.group(1) + if cpu_i in cpu2cset: + tags = ' cpu=%s cpuset=%s' % (cpu_i, cpu2cset[cpu_i]) + else: + tags = ' cpu=%s cpuset=none' % cpu_m.group(1) else: metric_percpu = '' tags = '' @@ -195,33 +261,49 @@ def main(): for line in f_entropy_avail: print("proc.kernel.entropy_avail %d %s" % (ts, line.strip())) - f_interrupts.seek(0) - ts = int(time.time()) - # Get number of CPUs from description line. - num_cpus = len(f_interrupts.readline().split()) - for line in f_interrupts: - cols = line.split() + # Only get softirqs stats every INTERRUPTS_INT_MULT iterations + if iteration % INTERRUPTS_INTVL_MULT == 0: + f_interrupts.seek(0) + ts = int(time.time()) + # Get number of CPUs from description line. + num_cpus = len(f_interrupts.readline().split()) - irq_type = cols[0].rstrip(":") - if irq_type.isalnum(): - if irq_type.isdigit(): - if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]: - irq_type = cols[-1] - else: - continue # Interrupt type is just a number, ignore. - for i, val in enumerate(cols[1:]): - if i >= num_cpus: - # All values read, remaining cols contain textual - # description - break - if not val.isdigit(): - # something is weird, there should only be digit values - sys.stderr.write("Unexpected interrupts value %r in" - " %r: " % (val, cols)) - break - print("proc.interrupts %s %s type=%s cpu=%s" - % (ts, val, irq_type, i)) + interrupt_dict = {} + for line in f_interrupts: + cols = line.split() + + irq_type = cols[0].rstrip(":") + if irq_type.isalnum(): + if irq_type.isdigit(): + if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]: + irq_type = cols[-1] + else: + continue # Interrupt type is just a number, ignore. + # Strip the thread number from the irq_type, e.g. eth8-8 -> eth8 + m = re.match('^(.*)-\d+$', irq_type) + if m: + irq_type = m.group(1) + + for i, val in enumerate(cols[1:]): + if i >= num_cpus: + # All values read, remaining cols contain textual + # description + break + if not val.isdigit(): + # something is weird, there should only be digit values + sys.stderr.write("Unexpected interrupts value %r in" + " %r: " % (val, cols)) + break + k = "type=%s cpu=%s" % (irq_type, i) + if k in interrupt_dict: + interrupt_dict[k] += int(val) + else: + interrupt_dict[k] = int(val) + + for k in interrupt_dict: + print ("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k)) + # Only get softirqs stats every SOFTIRQS_INT_MULT iterations f_softirqs.seek(0) ts = int(time.time()) # Get number of CPUs from description line. @@ -230,18 +312,18 @@ def main(): cols = line.split() irq_type = cols[0].rstrip(":") - for i, val in enumerate(cols[1:]): - if i >= num_cpus: - # All values read, remaining cols contain textual - # description - break - if not val.isdigit(): - # something is weird, there should only be digit values - sys.stderr.write("Unexpected softirq value %r in" - " %r: " % (val, cols)) - break - print("proc.softirqs %s %s type=%s cpu=%s" - % (ts, val, irq_type, i)) + for i, val in enumerate(cols[1:]): + if i >= num_cpus: + # All values read, remaining cols contain textual + # description + break + if not val.isdigit(): + # something is weird, there should only be digit values + sys.stderr.write("Unexpected softirq value %r in" + " %r: " % (val, cols)) + break + print ("proc.softirqs %s %s type=%s cpu=%s" + % (ts, val, irq_type, i)) print_numa_stats(numastats) From 7487c1b632cae16791614e2a13e0b313f46ccf7f Mon Sep 17 00:00:00 2001 From: execshred Date: Sat, 18 May 2019 12:39:16 -0700 Subject: [PATCH 2/5] Updated the code changes from the patch file rather than the diff file. Looks like I could've made a few mistakes in my first branch commit. --- collectors/0/procstats.py | 42 ++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/collectors/0/procstats.py b/collectors/0/procstats.py index 48f1e709..2e4d3f78 100755 --- a/collectors/0/procstats.py +++ b/collectors/0/procstats.py @@ -304,26 +304,28 @@ def main(): print ("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k)) # Only get softirqs stats every SOFTIRQS_INT_MULT iterations - f_softirqs.seek(0) - ts = int(time.time()) - # Get number of CPUs from description line. - num_cpus = len(f_softirqs.readline().split()) - for line in f_softirqs: - cols = line.split() - - irq_type = cols[0].rstrip(":") - for i, val in enumerate(cols[1:]): - if i >= num_cpus: - # All values read, remaining cols contain textual - # description - break - if not val.isdigit(): - # something is weird, there should only be digit values - sys.stderr.write("Unexpected softirq value %r in" - " %r: " % (val, cols)) - break - print ("proc.softirqs %s %s type=%s cpu=%s" - % (ts, val, irq_type, i)) + if iteration % SOFTIRQS_INTVL_MULT == 0: + f_softirqs.seek(0) + ts = int(time.time()) + # Get number of CPUs from description line. + num_cpus = len(f_softirqs.readline().split()) + for line in f_softirqs: + cols = line.split() + + irq_type = cols[0].rstrip(":") + + for i, val in enumerate(cols[1:]): + if i >= num_cpus: + # All values read, remaining cols contain textual + # description + break + if not val.isdigit(): + # something is weird, there should only be digit values + sys.stderr.write("Unexpected softirq value %r in" + " %r: " % (val, cols)) + break + print ("proc.softirqs %s %s type=%s cpu=%s" + % (ts, val, irq_type, i)) print_numa_stats(numastats) From 8ab5742f53b980522339b74ebbee2d24d82317ea Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 13 Sep 2019 13:27:59 -0400 Subject: [PATCH 3/5] Split into multiple functions so it's easier to understand. --- collectors/0/procstats.py | 133 ++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/collectors/0/procstats.py b/collectors/0/procstats.py index 2e4d3f78..ba37e712 100755 --- a/collectors/0/procstats.py +++ b/collectors/0/procstats.py @@ -24,10 +24,10 @@ INTERRUPTS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds SOFTIRQS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds -#RHEL 7 +# Modern Linux: CPUSET_PATH = "/sys/fs/cgroup/cpuset" if os.path.isdir("/dev/cpuset"): - #RHEL 6 + # Older Linux: CPUSET_PATH = "/dev/cpuset" COLLECTION_INTERVAL = 15 # seconds NUMADIR = "/sys/devices/system/node" @@ -263,69 +263,11 @@ def main(): # Only get softirqs stats every INTERRUPTS_INT_MULT iterations if iteration % INTERRUPTS_INTVL_MULT == 0: - f_interrupts.seek(0) - ts = int(time.time()) - # Get number of CPUs from description line. - num_cpus = len(f_interrupts.readline().split()) - - interrupt_dict = {} - for line in f_interrupts: - cols = line.split() - - irq_type = cols[0].rstrip(":") - if irq_type.isalnum(): - if irq_type.isdigit(): - if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]: - irq_type = cols[-1] - else: - continue # Interrupt type is just a number, ignore. - # Strip the thread number from the irq_type, e.g. eth8-8 -> eth8 - m = re.match('^(.*)-\d+$', irq_type) - if m: - irq_type = m.group(1) - - for i, val in enumerate(cols[1:]): - if i >= num_cpus: - # All values read, remaining cols contain textual - # description - break - if not val.isdigit(): - # something is weird, there should only be digit values - sys.stderr.write("Unexpected interrupts value %r in" - " %r: " % (val, cols)) - break - k = "type=%s cpu=%s" % (irq_type, i) - if k in interrupt_dict: - interrupt_dict[k] += int(val) - else: - interrupt_dict[k] = int(val) - - for k in interrupt_dict: - print ("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k)) + print_interrupts(f_interrupts) # Only get softirqs stats every SOFTIRQS_INT_MULT iterations if iteration % SOFTIRQS_INTVL_MULT == 0: - f_softirqs.seek(0) - ts = int(time.time()) - # Get number of CPUs from description line. - num_cpus = len(f_softirqs.readline().split()) - for line in f_softirqs: - cols = line.split() - - irq_type = cols[0].rstrip(":") - - for i, val in enumerate(cols[1:]): - if i >= num_cpus: - # All values read, remaining cols contain textual - # description - break - if not val.isdigit(): - # something is weird, there should only be digit values - sys.stderr.write("Unexpected softirq value %r in" - " %r: " % (val, cols)) - break - print ("proc.softirqs %s %s type=%s cpu=%s" - % (ts, val, irq_type, i)) + print_irqs(f_softirqs) print_numa_stats(numastats) @@ -352,6 +294,73 @@ def main(): sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) + +def print_interrupts(f_interrupts): + f_interrupts.seek(0) + ts = int(time.time()) + # Get number of CPUs from description line. + num_cpus = len(f_interrupts.readline().split()) + + interrupt_dict = {} + for line in f_interrupts: + cols = line.split() + + irq_type = cols[0].rstrip(":") + if irq_type.isalnum(): + if irq_type.isdigit(): + if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]: + irq_type = cols[-1] + else: + continue # Interrupt type is just a number, ignore. + # Strip the thread number from the irq_type, e.g. eth8-8 -> eth8 + m = re.match('^(.*)-\d+$', irq_type) + if m: + irq_type = m.group(1) + + for i, val in enumerate(cols[1:]): + if i >= num_cpus: + # All values read, remaining cols contain textual + # description + break + if not val.isdigit(): + # something is weird, there should only be digit values + sys.stderr.write("Unexpected interrupts value %r in" + " %r: " % (val, cols)) + break + k = "type=%s cpu=%s" % (irq_type, i) + if k in interrupt_dict: + interrupt_dict[k] += int(val) + else: + interrupt_dict[k] = int(val) + + for k in interrupt_dict: + print ("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k)) + + +def print_irqs(f_softirqs): + f_softirqs.seek(0) + ts = int(time.time()) + # Get number of CPUs from description line. + num_cpus = len(f_softirqs.readline().split()) + for line in f_softirqs: + cols = line.split() + + irq_type = cols[0].rstrip(":") + + for i, val in enumerate(cols[1:]): + if i >= num_cpus: + # All values read, remaining cols contain textual + # description + break + if not val.isdigit(): + # something is weird, there should only be digit values + sys.stderr.write("Unexpected softirq value %r in" + " %r: " % (val, cols)) + break + print ("proc.softirqs %s %s type=%s cpu=%s" + % (ts, val, irq_type, i)) + + if __name__ == "__main__": main() From 2b0086c80c75434febdeb61c3aab4ac7c82b767c Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 13 Sep 2019 13:41:50 -0400 Subject: [PATCH 4/5] More portable way of saying this. --- collectors/0/procstats.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/collectors/0/procstats.py b/collectors/0/procstats.py index ba37e712..0dd8e042 100755 --- a/collectors/0/procstats.py +++ b/collectors/0/procstats.py @@ -14,6 +14,7 @@ # """import various /proc stats from /proc into TSDB""" +import errno import os import re import sys @@ -102,8 +103,8 @@ def cpus_csets(cpuset_path): """Return a hash of cpu_id_as_string->cset_name""" try: csets = os.listdir(cpuset_path) - except OSError (errno, msg): - if errno == 2: # No such file or directory + except OSError as e: + if e.errno == errno.ENOENT: # No such file or directory return {} # We don't have csets raise From 73dd9f87d30fada399a2a6c6593b6c6e2d6cac0c Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 13 Sep 2019 13:51:06 -0400 Subject: [PATCH 5/5] Sanity check for procstats.py. --- tests.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests.py b/tests.py index 1348d8ee..8b92d0bf 100755 --- a/tests.py +++ b/tests.py @@ -16,6 +16,9 @@ import sys from stat import S_ISDIR, S_ISREG, ST_MODE import unittest +import subprocess +import time +import signal import mocks import tcollector @@ -323,6 +326,27 @@ def test_multi_empty_line_put_cond(self): self.assertEqual(''.join(stdout), expected) self.assertEqual(stderr, []) + +class CollectorSanityCheckTests(unittest.TestCase): + """Just make sure you can run a collector without it blowing up.""" + + def test_procstats(self): + env = os.environ.copy() + if env.get("PYTHONPATH"): + env["PYTHONPATH"] += ":." + else: + env["PYTHONPATH"] = "." + p = subprocess.Popen(["collectors/0/procstats.py"], env=env, + stdout=subprocess.PIPE) + time.sleep(5) + p.terminate() + time.sleep(1) + if p.poll() is None: + p.kill() + self.assertEqual(p.poll(), -signal.SIGTERM) + self.assertIn(b"proc.", p.stdout.read()) + + if __name__ == '__main__': cdir = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'collectors')