Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add info on the cpu sets to procstats #432

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 141 additions & 47 deletions collectors/0/procstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#
"""import various /proc stats from /proc into TSDB"""

import errno
import os
import re
import sys
Expand All @@ -22,6 +23,13 @@

from collectors.lib import utils

# /proc/interrupts is only reported on every INTERRUPTS_INTVL_MULT-th pass of
# the main loop, i.e. every INTERRUPTS_INTVL_MULT * COLLECTION_INTERVAL seconds.
INTERRUPTS_INTVL_MULT = 4
# /proc/softirqs is only reported on every SOFTIRQS_INTVL_MULT-th pass of the
# main loop, i.e. every SOFTIRQS_INTVL_MULT * COLLECTION_INTERVAL seconds.
SOFTIRQS_INTVL_MULT = 4
# Directory holding the cpusets: /dev/cpuset on older Linux when that directory
# exists, otherwise the modern Linux location under /sys/fs/cgroup.
CPUSET_PATH = ("/dev/cpuset" if os.path.isdir("/dev/cpuset")
               else "/sys/fs/cgroup/cpuset")
COLLECTION_INTERVAL = 15  # seconds
NUMADIR = "/sys/devices/system/node"

Expand Down Expand Up @@ -79,6 +87,58 @@ def print_numa_stats(numafiles):
% (ts, stats["interleave_hit"], node_id))
numafile.close()

def expand_numlist(s):
    """Return a list of numbers from a list with ranges.

    e.g. "4,5-10,14-16" -> [4, 5, 6, 7, 8, 9, 10, 14, 15, 16]

    Args:
        s: comma-separated string whose items are either a single number
            or an inclusive "low-high" range. Surrounding whitespace
            (such as a trailing newline) and empty items are ignored.

    Returns:
        A list of ints in the order the items appear; an empty list when
        the string contains no items.

    Raises:
        ValueError: if an item is neither a number nor a "low-high" range.
    """
    numbers = []
    for item in s.split(','):
        item = item.strip()
        if not item:
            # Empty input or a stray/trailing comma: nothing to expand.
            continue
        if '-' in item:
            low, high = map(int, item.split('-'))
            numbers.extend(range(low, high + 1))
        else:
            numbers.append(int(item))
    return numbers

def cpus_csets(cpuset_path):
    """Return a hash of cpu_id_as_string->cset_name.

    Every sub-directory of cpuset_path is treated as one cpuset. Its CPU
    list is read from the file 'cpuset.cpus', or from 'cpus' when the
    former does not exist, and expanded with expand_numlist().

    Args:
        cpuset_path: directory whose sub-directories are the cpusets.

    Returns:
        A dict mapping each CPU number (as a string) to the name of the
        cpuset directory that lists it. Empty when cpuset_path does not
        exist. If a CPU is listed by several cpusets, the one processed
        last wins.

    Raises:
        OSError: if cpuset_path cannot be listed for any reason other
            than not existing.
    """
    try:
        entries = os.listdir(cpuset_path)
    except OSError as e:
        if e.errno == errno.ENOENT:  # No such file or directory
            return {}  # We don't have csets
        raise

    cpu_list_re = re.compile(r'^[-0-9,]+$')
    cpu2cset = {}
    for cset in entries:
        cset_dir = os.path.join(cpuset_path, cset)
        if not os.path.isdir(cset_dir):
            continue

        cpuspath = os.path.join(cset_dir, 'cpuset.cpus')
        if not os.path.isfile(cpuspath):
            cpuspath = os.path.join(cset_dir, 'cpus')
        if not os.path.isfile(cpuspath):
            # No such file?? Ignore this cset
            sys.stderr.write("No 'cpuset.cpus' or 'cpus' file in %s!\n"
                             % cset_dir)
            continue

        try:
            f_cpus = open(cpuspath)
        except (IOError, OSError):
            # Ignore that one and continue
            sys.stderr.write("Could not open %s!\n" % cpuspath)
            continue

        format_errors = 0
        # The context manager closes the file; this function runs on every
        # collection cycle, so handles must not be left open.
        with f_cpus:
            for line in f_cpus:
                if not line.strip():
                    # A blank line lists no CPUs: nothing to record and
                    # nothing malformed to report.
                    continue
                if not cpu_list_re.match(line):
                    format_errors += 1
                    continue
                try:
                    cpus = expand_numlist(line)
                except ValueError:
                    # Only valid characters, but not a valid list (e.g. "1-").
                    format_errors += 1
                    continue
                for cpu in cpus:
                    cpu2cset[str(cpu)] = cset
        if format_errors > 0:
            sys.stderr.write("%d line(s) of %s were not in the expected format\n"
                             % (format_errors, cpuspath))

    return cpu2cset

def main():
"""procstats main loop"""
Expand Down Expand Up @@ -109,7 +169,9 @@ def main():
numastats = find_sysfs_numa_stats()
utils.drop_privileges()

iteration = -1
while True:
iteration += 1
# proc.uptime
f_uptime.seek(0)
ts = int(time.time())
Expand Down Expand Up @@ -148,6 +210,7 @@ def main():
# proc.stat
f_stat.seek(0)
ts = int(time.time())
cpu2cset = cpus_csets(CPUSET_PATH)
for line in f_stat:
m = re.match("(\w+)\s+(.*)", line)
if not m:
Expand All @@ -156,7 +219,11 @@ def main():
cpu_m = re.match("cpu(\d+)", m.group(1))
if cpu_m:
metric_percpu = '.percpu'
tags = ' cpu=%s' % cpu_m.group(1)
cpu_i = cpu_m.group(1)
if cpu_i in cpu2cset:
tags = ' cpu=%s cpuset=%s' % (cpu_i, cpu2cset[cpu_i])
else:
tags = ' cpu=%s cpuset=none' % cpu_m.group(1)
else:
metric_percpu = ''
tags = ''
Expand Down Expand Up @@ -195,53 +262,13 @@ def main():
for line in f_entropy_avail:
print("proc.kernel.entropy_avail %d %s" % (ts, line.strip()))

f_interrupts.seek(0)
ts = int(time.time())
# Get number of CPUs from description line.
num_cpus = len(f_interrupts.readline().split())
for line in f_interrupts:
cols = line.split()

irq_type = cols[0].rstrip(":")
if irq_type.isalnum():
if irq_type.isdigit():
if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]:
irq_type = cols[-1]
else:
continue # Interrupt type is just a number, ignore.
for i, val in enumerate(cols[1:]):
if i >= num_cpus:
# All values read, remaining cols contain textual
# description
break
if not val.isdigit():
# something is weird, there should only be digit values
sys.stderr.write("Unexpected interrupts value %r in"
" %r: " % (val, cols))
break
print("proc.interrupts %s %s type=%s cpu=%s"
% (ts, val, irq_type, i))

f_softirqs.seek(0)
ts = int(time.time())
# Get number of CPUs from description line.
num_cpus = len(f_softirqs.readline().split())
for line in f_softirqs:
cols = line.split()
# Only get softirqs stats every INTERRUPTS_INT_MULT iterations
if iteration % INTERRUPTS_INTVL_MULT == 0:
print_interrupts(f_interrupts)

irq_type = cols[0].rstrip(":")
for i, val in enumerate(cols[1:]):
if i >= num_cpus:
# All values read, remaining cols contain textual
# description
break
if not val.isdigit():
# something is weird, there should only be digit values
sys.stderr.write("Unexpected softirq value %r in"
" %r: " % (val, cols))
break
print("proc.softirqs %s %s type=%s cpu=%s"
% (ts, val, irq_type, i))
# Only get softirqs stats every SOFTIRQS_INT_MULT iterations
if iteration % SOFTIRQS_INTVL_MULT == 0:
print_irqs(f_softirqs)

print_numa_stats(numastats)

Expand All @@ -268,6 +295,73 @@ def main():
sys.stdout.flush()
time.sleep(COLLECTION_INTERVAL)


# Matches an irq name carrying a trailing "-<number>" suffix, e.g. eth8-8.
_IRQ_THREAD_SUFFIX_RE = re.compile(r'^(.*)-\d+$')


def print_interrupts(f_interrupts):
    """Print one proc.interrupts data point per interrupt type and CPU.

    The file is rewound and its first line is used to count the CPUs
    (one column per CPU). Each following line contributes its first
    num_cpus counters. Purely numeric interrupt names are ignored unless
    the line ends with "PCI-MSI-edge" followed by a name containing
    "eth", in which case that trailing name is used as the type. A
    trailing "-<number>" is stripped from the type and counters that end
    up with the same type and CPU are summed before printing.

    Args:
        f_interrupts: open, seekable file object whose content is laid
            out like /proc/interrupts.
    """
    f_interrupts.seek(0)
    ts = int(time.time())
    # Get number of CPUs from description line.
    num_cpus = len(f_interrupts.readline().split())

    interrupt_dict = {}
    for line in f_interrupts:
        cols = line.split()
        if not cols:
            continue  # Blank line, nothing to parse.

        irq_type = cols[0].rstrip(":")
        if not irq_type.isalnum():
            continue
        if irq_type.isdigit():
            if (len(cols) >= 2 and cols[-2] == "PCI-MSI-edge"
                    and "eth" in cols[-1]):
                irq_type = cols[-1]
            else:
                continue  # Interrupt type is just a number, ignore.

        # Strip the thread number from the irq_type, e.g. eth8-8 -> eth8
        m = _IRQ_THREAD_SUFFIX_RE.match(irq_type)
        if m:
            irq_type = m.group(1)

        # Only the first num_cpus columns after the name are counters; the
        # remaining columns contain the textual description.
        for i, val in enumerate(cols[1:num_cpus + 1]):
            if not val.isdigit():
                # something is weird, there should only be digit values
                sys.stderr.write("Unexpected interrupts value %r in %r\n"
                                 % (val, cols))
                break
            k = "type=%s cpu=%s" % (irq_type, i)
            interrupt_dict[k] = interrupt_dict.get(k, 0) + int(val)

    for k in interrupt_dict:
        print("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k))


def print_irqs(f_softirqs):
    """Print one proc.softirqs data point per softirq type and CPU.

    The file is rewound and its first line is used to count the CPUs
    (one column per CPU). For each following line the first column,
    without its trailing colon, is the type and the next num_cpus
    columns are printed as that type's per-CPU values.

    Args:
        f_softirqs: open, seekable file object whose content is laid out
            like /proc/softirqs.
    """
    f_softirqs.seek(0)
    ts = int(time.time())
    # Get number of CPUs from description line.
    num_cpus = len(f_softirqs.readline().split())
    for line in f_softirqs:
        cols = line.split()
        if not cols:
            continue  # Blank line, nothing to parse.

        irq_type = cols[0].rstrip(":")

        # Only the first num_cpus columns after the name are counters; any
        # remaining columns contain a textual description.
        for i, val in enumerate(cols[1:num_cpus + 1]):
            if not val.isdigit():
                # something is weird, there should only be digit values
                sys.stderr.write("Unexpected softirq value %r in %r\n"
                                 % (val, cols))
                break
            print("proc.softirqs %s %s type=%s cpu=%s"
                  % (ts, val, irq_type, i))


if __name__ == "__main__":
main()

23 changes: 23 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from stat import S_ISDIR, S_ISREG, ST_MODE
import unittest
import subprocess
import time
import signal

import mocks
import tcollector
Expand Down Expand Up @@ -460,6 +462,27 @@ def test_multi_empty_line_put_cond(self):
self.assertEqual(''.join(stdout), expected)
self.assertEqual(stderr, [])


class CollectorSanityCheckTests(unittest.TestCase):
    """Just make sure you can run a collector without it blowing up."""

    def test_procstats(self):
        """Run procstats.py for a few seconds, then stop it with SIGTERM.

        Passes when the collector is still alive to receive the signal
        (exit status is -SIGTERM) and has written at least one "proc."
        metric to stdout.
        """
        env = os.environ.copy()
        # Append the current directory to PYTHONPATH -- presumably so the
        # collector can import its helper package when the tests are run
        # from the repository root.
        if env.get("PYTHONPATH"):
            env["PYTHONPATH"] += os.pathsep + "."
        else:
            env["PYTHONPATH"] = "."
        p = subprocess.Popen(["collectors/0/procstats.py"], env=env,
                             stdout=subprocess.PIPE)
        time.sleep(5)
        p.terminate()
        time.sleep(1)
        if p.poll() is None:
            # Did not exit on SIGTERM; the exit-status check below will fail.
            p.kill()
        # communicate() drains and closes the pipe and reaps the child.
        output = p.communicate()[0]
        self.assertEqual(p.returncode, -signal.SIGTERM)
        self.assertIn(b"proc.", output)


if __name__ == '__main__':
import logging
logging.basicConfig()
Expand Down