From 642271cf739020ad2ab816a46872cbf8d4299bc1 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Thu, 22 Feb 2018 16:29:56 +0530 Subject: [PATCH 1/8] Python 3 compatibility --- .travis.yml | 3 + collectors/0/check_certs.py | 9 +- collectors/0/couchbase.py | 279 +++++----- collectors/0/dfstat.py | 226 ++++----- collectors/0/docker.py | 98 ++-- collectors/0/elasticsearch.py | 313 ++++++------ collectors/0/flume.py | 158 +++--- collectors/0/g1gc.py | 132 +++-- collectors/0/graphite_bridge.py | 28 +- collectors/0/gstat.py | 60 ++- collectors/0/hadoop_datanode.py | 4 +- collectors/0/hadoop_namenode.py | 4 +- collectors/0/haproxy.py | 101 ++-- collectors/0/hbase_regionserver.py | 9 +- collectors/0/ifrate.py | 64 +-- collectors/0/ifstat.py | 7 +- collectors/0/iostat.py | 58 ++- collectors/0/jolokia.py | 18 +- collectors/0/memcache.py | 298 +++++------ collectors/0/mongo.py | 17 +- collectors/0/mongo3.py | 19 +- collectors/0/mountstats.py | 34 +- collectors/0/mysql.py | 676 +++++++++++++------------ collectors/0/netstat.py | 40 +- collectors/0/nfsstat.py | 22 +- collectors/0/ntpstat.py | 26 +- collectors/0/postgresql.py | 195 +++---- collectors/0/procnettcp.py | 29 +- collectors/0/procstats.py | 103 ++-- collectors/0/pxc-collector.py | 90 ++-- collectors/0/redis_stats.py | 9 +- collectors/0/riak.py | 8 +- collectors/0/smart_stats.py | 345 +++++++------ collectors/0/sysload.py | 306 +++++------ collectors/0/tcollector.py | 20 +- collectors/0/tcp_bridge.py | 18 +- collectors/0/udp_bridge.py | 16 +- collectors/0/varnishstat.py | 87 ++-- collectors/0/zabbix_bridge.py | 16 +- collectors/0/zfsiostats.py | 109 ++-- collectors/0/zfsolkernstats.py | 19 +- collectors/0/zookeeper.py | 25 +- collectors/300/aws_cloudwatch_stats.py | 95 ++-- collectors/900/zabbix_bridge_cache.py | 11 +- collectors/etc/metric_naming.py | 6 +- collectors/etc/mysqlconf.py | 6 +- collectors/lib/hadoop_http.py | 26 +- collectors/lib/utils.py | 11 +- eos/collectors/eos.py | 317 ++++++------ eos/tcollector_agent.py | 568 ++++++++++----------- grok_scraper.py | 5 +- pylint-runner.py | 43 +- tcollector.py | 103 ++-- tests.py | 29 +- 54 files changed, 2783 insertions(+), 2535 deletions(-) diff --git a/.travis.yml b/.travis.yml index d3d5762e..cfcd9fd7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,9 @@ language: python python: - "2.6" - "2.7" + - "3.4" + - "3.5" + - "3.6" install: - sudo apt-get -qq update diff --git a/collectors/0/check_certs.py b/collectors/0/check_certs.py index 28114caa..9234d33e 100755 --- a/collectors/0/check_certs.py +++ b/collectors/0/check_certs.py @@ -6,8 +6,7 @@ import time from datetime import datetime -import ssllabs - +from collectors.lib import ssllabs from collectors.etc import check_certs_conf logging.basicConfig(stream=sys.stdout) @@ -1000,7 +999,7 @@ def main(): try: name_vs_domains = check_certs_conf.get_config() timestamp = int(time.time()) - for name, domain in name_vs_domains.iteritems(): + for name, domain in name_vs_domains.items(): assessment = ssllabs.SSLLabsAssessment(host=domain) info = assessment.analyze(ignore_mismatch='off', from_cache='off', publish='off') j = json.loads(json.dumps(info)) @@ -1010,7 +1009,7 @@ def main(): # print endpoint['gradeTrustIgnored'] # print endpoint['hasWarnings'] # print endpoint['ipAddress'] - expiry_time = long(endpoint['details']['cert']['notAfter']) + expiry_time = int(endpoint['details']['cert']['notAfter']) expiry_time_sec = expiry_time / 1000 nowtime = datetime.utcnow() @@ -1028,4 +1027,4 @@ def main(): if __name__ == '__main__': - main() \ No 
newline at end of file + main() diff --git a/collectors/0/couchbase.py b/collectors/0/couchbase.py index 1e5d71ba..993ea196 100755 --- a/collectors/0/couchbase.py +++ b/collectors/0/couchbase.py @@ -21,154 +21,161 @@ COLLECTION_INTERVAL = CONFIG['collection_interval'] COUCHBASE_INITFILE = CONFIG['couchbase_initfile'] -KEYS = frozenset( [ - 'bucket_active_conns', - 'cas_hits', - 'cas_misses', - 'cmd_get', - 'cmd_set', - 'curr_connections', - 'curr_conns_on_port_11209', - 'curr_conns_on_port_11210', - 'ep_queue_size', - 'ep_num_value_ejects', - 'ep_num_eject_failures', - 'ep_oom_errors', - 'ep_tmp_oom_errors', - 'get_hits', - 'get_misses', - 'mem_used', - 'total_connections', - 'total_heap_bytes', - 'total_free_bytes', - 'total_allocated_bytes', - 'total_fragmentation_bytes', - 'tcmalloc_current_thread_cache_bytes', - 'tcmalloc_max_thread_cache_bytes', - 'tcmalloc_unmapped_bytes', - ] ) +KEYS = frozenset([ + 'bucket_active_conns', + 'cas_hits', + 'cas_misses', + 'cmd_get', + 'cmd_set', + 'curr_connections', + 'curr_conns_on_port_11209', + 'curr_conns_on_port_11210', + 'ep_queue_size', + 'ep_num_value_ejects', + 'ep_num_eject_failures', + 'ep_oom_errors', + 'ep_tmp_oom_errors', + 'get_hits', + 'get_misses', + 'mem_used', + 'total_connections', + 'total_heap_bytes', + 'total_free_bytes', + 'total_allocated_bytes', + 'total_fragmentation_bytes', + 'tcmalloc_current_thread_cache_bytes', + 'tcmalloc_max_thread_cache_bytes', + 'tcmalloc_unmapped_bytes', +]) + def find_couchbase_pid(): - """Find out the pid of couchbase""" - if not os.path.isfile(COUCHBASE_INITFILE): - return - - try: - fd = open(COUCHBASE_INITFILE) - for line in fd: - if line.startswith("exec"): - init_script = line.split()[1] - fd.close() - except IOError: - utils.err("Check permission of file (%s)" % COUCHBASE_INITFILE) - return - - try: - fd = open(init_script) - for line in fd: - if line.startswith("PIDFILE"): - pid_file = line.split("=")[1].rsplit()[0] - fd.close() - except IOError: - utils.err("Check permission of file (%s)" % init_script) - return - - try: - fd = open(pid_file) - pid = fd.read() - fd.close() - except IOError: - utils.err("Couchbase-server is not running, since no pid file exists") - return - - return pid.split()[0] + """Find out the pid of couchbase""" + if not os.path.isfile(COUCHBASE_INITFILE): + return + + try: + fd = open(COUCHBASE_INITFILE) + for line in fd: + if line.startswith("exec"): + init_script = line.split()[1] + fd.close() + except IOError: + utils.err("Check permission of file (%s)" % COUCHBASE_INITFILE) + return + + try: + fd = open(init_script) + for line in fd: + if line.startswith("PIDFILE"): + pid_file = line.split("=")[1].rsplit()[0] + fd.close() + except IOError: + utils.err("Check permission of file (%s)" % init_script) + return + + try: + fd = open(pid_file) + pid = fd.read() + fd.close() + except IOError: + utils.err("Couchbase-server is not running, since no pid file exists") + return + + return pid.split()[0] + def find_conf_file(pid): - """Returns config file for couchbase-server.""" - try: - fd = open('/proc/%s/cmdline' % pid) - except IOError, e: - utils.err("Couchbase (pid %s) went away ? %s" % (pid, e)) - return - try: - config = fd.read().split("config_path")[1].split("\"")[1] - return config - finally: - fd.close() + """Returns config file for couchbase-server.""" + try: + fd = open('/proc/%s/cmdline' % pid) + except IOError as e: + utils.err("Couchbase (pid %s) went away ? 
%s" % (pid, e)) + return + try: + config = fd.read().split("config_path")[1].split("\"")[1] + return config + finally: + fd.close() + def find_bindir_path(config_file): - """Returns the bin directory path""" - try: - fd = open(config_file) - except IOError, e: - utils.err("Error for Config file (%s): %s" % (config_file, e)) - return None - try: - for line in fd: - if line.startswith("{path_config_bindir"): - return line.split(",")[1].split("\"")[1] - finally: - fd.close() + """Returns the bin directory path""" + try: + fd = open(config_file) + except IOError as e: + utils.err("Error for Config file (%s): %s" % (config_file, e)) + return None + try: + for line in fd: + if line.startswith("{path_config_bindir"): + return line.split(",")[1].split("\"")[1] + finally: + fd.close() + def list_bucket(bin_dir): - """Returns the list of memcached or membase buckets""" - buckets = [] - if not os.path.isfile("%s/couchbase-cli" % bin_dir): + """Returns the list of memcached or membase buckets""" + buckets = [] + if not os.path.isfile("%s/couchbase-cli" % bin_dir): + return buckets + cli = ("%s/couchbase-cli" % bin_dir) + try: + buck = subprocess.check_output([cli, "bucket-list", "--cluster", + "localhost:8091"]) + except subprocess.CalledProcessError: + return buckets + regex = re.compile("[\s\w]+:[\s\w]+$") + for i in buck.splitlines(): + if not regex.match(i): + buckets.append(i) return buckets - cli = ("%s/couchbase-cli" % bin_dir) - try: - buck = subprocess.check_output([cli, "bucket-list", "--cluster", - "localhost:8091"]) - except subprocess.CalledProcessError: - return buckets - regex = re.compile("[\s\w]+:[\s\w]+$") - for i in buck.splitlines(): - if not regex.match(i): - buckets.append(i) - return buckets + def collect_stats(bin_dir, bucket): - """Returns statistics related to a particular bucket""" - if not os.path.isfile("%s/cbstats" % bin_dir): - return - cli = ("%s/cbstats" % bin_dir) - try: - ts = time.time() - stats = subprocess.check_output([cli, "localhost:11211", "-b", bucket, - "all"]) - except subprocess.CalledProcessError: - return - for stat in stats.splitlines(): - metric = stat.split(":")[0].lstrip(" ") - value = stat.split(":")[1].lstrip(" \t") - if metric in KEYS: - print ("couchbase.%s %i %s bucket=%s" % (metric, ts, value, bucket)) + """Returns statistics related to a particular bucket""" + if not os.path.isfile("%s/cbstats" % bin_dir): + return + cli = ("%s/cbstats" % bin_dir) + try: + ts = time.time() + stats = subprocess.check_output([cli, "localhost:11211", "-b", bucket, + "all"]) + except subprocess.CalledProcessError: + return + for stat in stats.splitlines(): + metric = stat.split(b":")[0].lstrip(b" ") + value = stat.split(b":")[1].lstrip(b" \t") + if metric in KEYS: + print("couchbase.%s %i %s bucket=%s" % (metric, ts, value, bucket)) + def main(): - utils.drop_privileges() - pid = find_couchbase_pid() - if not pid: - utils.err("Error: Either couchbase-server is not running or file (%s)" - " doesn't exist" % COUCHBASE_INITFILE) - return 13 - - conf_file = find_conf_file(pid) - if not conf_file: - utils.err("Error: Can't find config file (%s)" % conf_file) - return 13 - - bin_dir = find_bindir_path(conf_file) - if not bin_dir: - utils.err("Error: Can't find bindir path in config file") - return 13 - - while True: - # Listing bucket everytime so as to start collecting datapoints - # of any new bucket. 
- buckets = list_bucket(bin_dir) - for b in buckets: - collect_stats(bin_dir, b) - time.sleep(COLLECTION_INTERVAL) + utils.drop_privileges() + pid = find_couchbase_pid() + if not pid: + utils.err("Error: Either couchbase-server is not running or file (%s)" + " doesn't exist" % COUCHBASE_INITFILE) + return 13 + + conf_file = find_conf_file(pid) + if not conf_file: + utils.err("Error: Can't find config file (%s)" % conf_file) + return 13 + + bin_dir = find_bindir_path(conf_file) + if not bin_dir: + utils.err("Error: Can't find bindir path in config file") + return 13 + + while True: + # Listing bucket everytime so as to start collecting datapoints + # of any new bucket. + buckets = list_bucket(bin_dir) + for b in buckets: + collect_stats(bin_dir, b) + time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/dfstat.py b/collectors/0/dfstat.py index e7f32876..74adf0bc 100755 --- a/collectors/0/dfstat.py +++ b/collectors/0/dfstat.py @@ -41,125 +41,125 @@ # File system types to ignore FSTYPE_IGNORE = frozenset([ - "cgroup", - "debugfs", - "devtmpfs", - "nfs", - "rpc_pipefs", - "rootfs", + "cgroup", + "debugfs", + "devtmpfs", + "nfs", + "rpc_pipefs", + "rootfs", ]) METRIC_MAPPING = yaml_conf.load_collector_configuration('node_metrics.yml') + def main(): - """dfstats main loop""" - try: - f_mounts = open("/proc/mounts", "r") - except IOError, e: - utils.err("error: can't open /proc/mounts: %s" % e) - return 13 # Ask tcollector to not respawn us - - utils.drop_privileges() - - while True: - devices = [] - f_mounts.seek(0) - ts = int(time.time()) - - for line in f_mounts: - # Docs come from the fstab(5) - # fs_spec # Mounted block special device or remote filesystem - # fs_file # Mount point - # fs_vfstype # File system type - # fs_mntops # Mount options - # fs_freq # Dump(8) utility flags - # fs_passno # Order in which filesystem checks are done at reboot time - try: - fs_spec, fs_file, fs_vfstype, fs_mntops, fs_freq, fs_passno = line.split(None) - except ValueError, e: - utils.err("error: can't parse line at /proc/mounts: %s" % e) - continue - - if fs_spec == "none": - continue - elif fs_vfstype in FSTYPE_IGNORE or fs_vfstype.startswith("fuse."): - continue - # startswith(tuple) avoided to preserve support of Python 2.4 - elif fs_file.startswith("/dev") or fs_file.startswith("/sys") or \ - fs_file.startswith("/proc") or fs_file.startswith("/lib") or \ - fs_file.startswith("net:"): - continue - - # keep /dev/xxx device with shorter fs_file (remove mount binds) - device_found = False - if fs_spec.startswith("/dev"): + """dfstats main loop""" + try: + f_mounts = open("/proc/mounts", "r") + except IOError as e: + utils.err("error: can't open /proc/mounts: %s" % e) + return 13 # Ask tcollector to not respawn us + + utils.drop_privileges() + + while True: + devices = [] + f_mounts.seek(0) + ts = int(time.time()) + + for line in f_mounts: + # Docs come from the fstab(5) + # fs_spec # Mounted block special device or remote filesystem + # fs_file # Mount point + # fs_vfstype # File system type + # fs_mntops # Mount options + # fs_freq # Dump(8) utility flags + # fs_passno # Order in which filesystem checks are done at reboot time + try: + fs_spec, fs_file, fs_vfstype, fs_mntops, fs_freq, fs_passno = line.split(None) + except ValueError as e: + utils.err("error: can't parse line at /proc/mounts: %s" % e) + continue + + if fs_spec == "none": + continue + elif fs_vfstype in FSTYPE_IGNORE or fs_vfstype.startswith("fuse."): + continue + # 
startswith(tuple) avoided to preserve support of Python 2.4 + elif fs_file.startswith("/dev") or fs_file.startswith("/sys") or \ + fs_file.startswith("/proc") or fs_file.startswith("/lib") or \ + fs_file.startswith("net:"): + continue + + # keep /dev/xxx device with shorter fs_file (remove mount binds) + device_found = False + if fs_spec.startswith("/dev"): + for device in devices: + if fs_spec == device[0]: + device_found = True + if len(fs_file) < len(device[1]): + device[1] = fs_file + break + if not device_found: + devices.append([fs_spec, fs_file, fs_vfstype]) + else: + devices.append([fs_spec, fs_file, fs_vfstype]) + for device in devices: - if fs_spec == device[0]: - device_found = True - if len(fs_file) < len(device[1]): - device[1] = fs_file - break - if not device_found: - devices.append([fs_spec, fs_file, fs_vfstype]) - else: - devices.append([fs_spec, fs_file, fs_vfstype]) - - - for device in devices: - fs_spec, fs_file, fs_vfstype = device - try: - r = os.statvfs(fs_file) - except OSError, e: - utils.err("can't get info for mount point: %s: %s" % (fs_file, e)) - continue - - used = r.f_blocks - r.f_bfree - - # conditional expression avoided to preserve support of Python 2.4 - # percent_used = 100 if r.f_blocks == 0 else used * 100.0 / r.f_blocks - if r.f_blocks == 0: - percent_used = 100 - else: - percent_used = used * 100.0 / r.f_blocks - - print("df.bytes.total %d %s mount=%s fstype=%s" - % (ts, r.f_frsize * r.f_blocks, fs_file, fs_vfstype)) - bused = r.f_frsize * used - print("df.bytes.used %d %s mount=%s fstype=%s" - % (ts, bused, fs_file, fs_vfstype)) - metric_naming.print_if_apptuit_standard_metric("df.bytes.used", METRIC_MAPPING, ts, bused, - tags={"mount": fs_file, "fstype": fs_vfstype}, tags_str=None) - print("df.bytes.percentused %d %s mount=%s fstype=%s" - % (ts, percent_used, fs_file, fs_vfstype)) - metric_naming.print_if_apptuit_standard_metric("df.bytes.percentused", METRIC_MAPPING, ts, percent_used, - tags={"mount": fs_file, "fstype": fs_vfstype}, tags_str=None) - bfree = r.f_frsize * r.f_bfree - print("df.bytes.free %d %s mount=%s fstype=%s" - % (ts, bfree, fs_file, fs_vfstype)) - metric_naming.print_if_apptuit_standard_metric("df.bytes.free", METRIC_MAPPING, ts, bfree, - tags={"mount": fs_file, "fstype": fs_vfstype}, tags_str=None) - - used = r.f_files - r.f_ffree - - # percent_used = 100 if r.f_files == 0 else used * 100.0 / r.f_files - if r.f_files == 0: - percent_used = 100 - else: - percent_used = used * 100.0 / r.f_files - - print("df.inodes.total %d %s mount=%s fstype=%s" - % (ts, r.f_files, fs_file, fs_vfstype)) - print("df.inodes.used %d %s mount=%s fstype=%s" - % (ts, used, fs_file, fs_vfstype)) - print("df.inodes.percentused %d %s mount=%s fstype=%s" - % (ts, percent_used, fs_file, fs_vfstype)) - print("df.inodes.free %d %s mount=%s fstype=%s" - % (ts, r.f_ffree, fs_file, fs_vfstype)) - - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) + fs_spec, fs_file, fs_vfstype = device + try: + r = os.statvfs(fs_file) + except OSError as e: + utils.err("can't get info for mount point: %s: %s" % (fs_file, e)) + continue + + used = r.f_blocks - r.f_bfree + + # conditional expression avoided to preserve support of Python 2.4 + # percent_used = 100 if r.f_blocks == 0 else used * 100.0 / r.f_blocks + if r.f_blocks == 0: + percent_used = 100 + else: + percent_used = used * 100.0 / r.f_blocks + + print("df.bytes.total %d %s mount=%s fstype=%s" + % (ts, r.f_frsize * r.f_blocks, fs_file, fs_vfstype)) + bused = r.f_frsize * used + print("df.bytes.used %d %s mount=%s 
fstype=%s" + % (ts, bused, fs_file, fs_vfstype)) + metric_naming.print_if_apptuit_standard_metric("df.bytes.used", METRIC_MAPPING, ts, bused, + tags={"mount": fs_file, "fstype": fs_vfstype}, tags_str=None) + print("df.bytes.percentused %d %s mount=%s fstype=%s" + % (ts, percent_used, fs_file, fs_vfstype)) + metric_naming.print_if_apptuit_standard_metric("df.bytes.percentused", METRIC_MAPPING, ts, percent_used, + tags={"mount": fs_file, "fstype": fs_vfstype}, tags_str=None) + bfree = r.f_frsize * r.f_bfree + print("df.bytes.free %d %s mount=%s fstype=%s" + % (ts, bfree, fs_file, fs_vfstype)) + metric_naming.print_if_apptuit_standard_metric("df.bytes.free", METRIC_MAPPING, ts, bfree, + tags={"mount": fs_file, "fstype": fs_vfstype}, tags_str=None) + + used = r.f_files - r.f_ffree + + # percent_used = 100 if r.f_files == 0 else used * 100.0 / r.f_files + if r.f_files == 0: + percent_used = 100 + else: + percent_used = used * 100.0 / r.f_files + + print("df.inodes.total %d %s mount=%s fstype=%s" + % (ts, r.f_files, fs_file, fs_vfstype)) + print("df.inodes.used %d %s mount=%s fstype=%s" + % (ts, used, fs_file, fs_vfstype)) + print("df.inodes.percentused %d %s mount=%s fstype=%s" + % (ts, percent_used, fs_file, fs_vfstype)) + print("df.inodes.free %d %s mount=%s fstype=%s" + % (ts, r.f_ffree, fs_file, fs_vfstype)) + + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main()) + sys.stdin.close() + sys.exit(main()) diff --git a/collectors/0/docker.py b/collectors/0/docker.py index 8a744c44..375676a2 100755 --- a/collectors/0/docker.py +++ b/collectors/0/docker.py @@ -15,13 +15,13 @@ CONFIG = docker_conf.get_config() COLLECTION_INTERVAL = CONFIG['interval'] -CGROUP_PATH =CONFIG['cgroup_path'] +CGROUP_PATH = CONFIG['cgroup_path'] ENABLED = docker_conf.enabled() DOCKER_SOCK = CONFIG['socket_path'] if not ENABLED: - sys.stderr.write("Docker collector is not enabled") - sys.exit(13) + sys.stderr.write("Docker collector is not enabled") + sys.exit(13) # proc_names example: # $ cat cpuacct.stat @@ -29,7 +29,7 @@ # system 72 proc_names = { "cpuacct.stat": ( - "user", "system", + "user", "system", ), "memory.stat": ( "cache", "rss", "mapped_file", "pgfault", "pgmajfault", "swap", "active_anon", @@ -52,15 +52,15 @@ ), } -def getnameandimage(containerid): +def getnameandimage(containerid): # Retrieve container json configuration file sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.settimeout(5) try: r = sock.connect_ex(DOCKER_SOCK) if (r != 0): - print >>sys.stderr, "Can not connect to %s" % (DOCKER_SOCK) + utils.err("Can not connect to %s" % (DOCKER_SOCK)) else: message = 'GET /containers/' + containerid + '/json HTTP/1.1\n\n' sock.sendall(message) @@ -73,52 +73,53 @@ def getnameandimage(containerid): # Retrieve container name and image m = re.search("{(.+)}", json_data) if m: - json_data = "{"+m.group(1)+"}" + json_data = "{" + m.group(1) + "}" try: data = json.loads(json_data) try: containernames[containerid] = data["Name"].lstrip('/') except: - print >>sys.stderr, containerid+" has no Name field" + utils.err(containerid + " has no Name field") try: containerimages[containerid] = data["Config"]["Image"].replace(':', '_') except: - print >>sys.stderr, containerid+" has no Image field" + utils.err(containerid + " has no Image field") except: - print >>sys.stderr, "Can not load json" + utils.err("Can not load json") + + except socket.timeout as e: + utils.err("Socket: %s" % e) - except socket.timeout, e: - print >>sys.stderr, "Socket: 
%s" % (e,) def senddata(datatosend, containerid): if datatosend: - datatosend += " containerid="+containerid + datatosend += " containerid=" + containerid if (containerid in containernames): - datatosend += " containername="+containernames[containerid] + datatosend += " containername=" + containernames[containerid] if (containerid in containerimages): - datatosend += " containerimage="+containerimages[containerid] - print "docker.%s" % datatosend + datatosend += " containerimage=" + containerimages[containerid] + print("docker.%s" % datatosend) sys.stdout.flush() -def readdockerstats(path, containerid): +def readdockerstats(path, containerid): # update containername and containerimage if needed if ((containerid not in containernames) or (containerid not in containerimages)): getnameandimage(containerid) # Retrieve and push stats for file_stat in os.listdir(path): - if (os.path.isfile(path+"/"+file_stat)\ - and ((file_stat in proc_names.keys()) or (file_stat in proc_names_to_agg.keys()))): + if (os.path.isfile(path + "/" + file_stat) \ + and ((file_stat in list(proc_names.keys())) or (file_stat in list(proc_names_to_agg.keys())))): try: - f_stat = open(path+"/"+file_stat) - except IOError, e: - print >>sys.stderr, "Failed to open input file: %s" % (e,) + f_stat = open(path + "/" + file_stat) + except IOError as e: + utils.err("Failed to open input file: %s" % e) return 1 ts = int(time.time()) # proc_name - if (file_stat in proc_names.keys()): + if (file_stat in list(proc_names.keys())): datatosend = None f_stat.seek(0) for line in f_stat: @@ -131,21 +132,21 @@ def readdockerstats(path, containerid): if subcategory in proc_names[file_stat]: if category == 'memory': if subcategory in ['active_anon', 'inactive_anon']: - subcattype = subcategory.split('_')[0] - subcategory = 'anon' + subcattype = subcategory.split('_')[0] + subcategory = 'anon' if subcategory in ['active_file', 'inactive_file']: - subcattype = subcategory.split('_')[0] - subcategory = 'file' + subcattype = subcategory.split('_')[0] + subcategory = 'file' tags = "type=%s" % subcategory if subcattype != None: - tags += " subtype=%s" % subcattype + tags += " subtype=%s" % subcattype datatosend = "%s %d %s %s" % (category, ts, value, tags) else: datatosend = "%s.%s %d %s" % (category, subcategory, ts, value) senddata(datatosend, containerid) # proc_names_to_agg else: - if (file_stat in proc_names_to_agg.keys()): + if (file_stat in list(proc_names_to_agg.keys())): for field_to_match in proc_names_to_agg[file_stat]: datatosend = None f_stat.seek(0) @@ -162,48 +163,53 @@ def readdockerstats(path, containerid): senddata("%s %d %s" % (datatosend, ts, count), containerid) f_stat.close() + def main(): """docker_cpu main loop""" global containernames global containerimages utils.drop_privileges() - cache=0 + cache = 0 while True: # Connect to Docker socket to get informations about containers every 4 times if (cache == 0): - containernames={} - containerimages={} + containernames = {} + containerimages = {} cache += 1 if (cache == 4): cache = 0 if os.path.isdir(CGROUP_PATH): for level1 in os.listdir(CGROUP_PATH): - if (os.path.isdir(CGROUP_PATH + "/"+level1+"/docker")\ - # /cgroup/cpu and /cgroup/cpuacct are often links to /cgroup/cpu,cpuacct - and not (((level1 == "cpu,cpuacct") or (level1 == "cpuacct")) and (os.path.isdir(CGROUP_PATH + "/cpu/docker")))): - for level2 in os.listdir(CGROUP_PATH + "/"+level1+"/docker"): - if os.path.isdir(CGROUP_PATH + "/"+level1+"/docker/"+level2): - readdockerstats(CGROUP_PATH + 
"/"+level1+"/docker/"+level2, level2) + if (os.path.isdir(CGROUP_PATH + "/" + level1 + "/docker") \ + # /cgroup/cpu and /cgroup/cpuacct are often links to /cgroup/cpu,cpuacct + and not (((level1 == "cpu,cpuacct") or (level1 == "cpuacct")) and ( + os.path.isdir(CGROUP_PATH + "/cpu/docker")))): + for level2 in os.listdir(CGROUP_PATH + "/" + level1 + "/docker"): + if os.path.isdir(CGROUP_PATH + "/" + level1 + "/docker/" + level2): + readdockerstats(CGROUP_PATH + "/" + level1 + "/docker/" + level2, level2) else: # If Docker cgroup is handled by slice # http://www.freedesktop.org/software/systemd/man/systemd.slice.html for slicename in ("system.slice", "machine.slice", "user.slice"): - if (os.path.isdir(CGROUP_PATH + "/"+level1+"/"+slicename)\ - # /cgroup/cpu and /cgroup/cpuacct are often links to /cgroup/cpu,cpuacct - and not (((level1 == "cpu,cpuacct") or (level1 == "cpuacct")) and (os.path.isdir(CGROUP_PATH + "/cpu/"+slicename)))): - for level2 in os.listdir(CGROUP_PATH + "/"+level1+"/"+slicename): - if os.path.isdir(CGROUP_PATH + "/"+level1+"/"+slicename+"/"+level2): + if (os.path.isdir(CGROUP_PATH + "/" + level1 + "/" + slicename) \ + # /cgroup/cpu and /cgroup/cpuacct are often links to /cgroup/cpu,cpuacct + and not (((level1 == "cpu,cpuacct") or (level1 == "cpuacct")) and ( + os.path.isdir(CGROUP_PATH + "/cpu/" + slicename)))): + for level2 in os.listdir(CGROUP_PATH + "/" + level1 + "/" + slicename): + if os.path.isdir(CGROUP_PATH + "/" + level1 + "/" + slicename + "/" + level2): m = re.search("^docker-(\w+)\.scope$", level2) if m: - readdockerstats(CGROUP_PATH + "/"+level1+"/"+slicename+"/"+level2, m.group(1)) + readdockerstats(CGROUP_PATH + "/" + level1 + "/" + slicename + "/" + level2, + m.group(1)) break if os.path.isdir(CGROUP_PATH + "/lxc"): for level1 in os.listdir(CGROUP_PATH + "/lxc"): - if os.path.isdir(CGROUP_PATH + "/lxc/"+level1): - readdockerstats(CGROUP_PATH + "/lxc/"+level1, level1) + if os.path.isdir(CGROUP_PATH + "/lxc/" + level1): + readdockerstats(CGROUP_PATH + "/lxc/" + level1, level1) time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/elasticsearch.py b/collectors/0/elasticsearch.py index c5fff9c4..cab8ef7d 100755 --- a/collectors/0/elasticsearch.py +++ b/collectors/0/elasticsearch.py @@ -16,214 +16,225 @@ # Tested with ES 0.16.5, 0.17.x, 0.90.1 . import errno -import httplib -try: - import json -except ImportError: - json = None # Handled gracefully in main. Not available by default in <2.6 import socket import sys import threading import time import re +is_py2 = sys.version[0] == '2' +if is_py2: + import httplib as httplib +else: + import http.client as httplib + +try: + import json +except ImportError: + json = None # Handled gracefully in main. 
Not available by default in <2.6 + from collectors.lib import utils from collectors.etc import elasticsearch_conf - COLLECTION_INTERVAL = 15 # seconds -DEFAULT_TIMEOUT = 10.0 # seconds +DEFAULT_TIMEOUT = 10.0 # seconds # regexes to separate differences in version numbers PRE_VER1 = re.compile(r'^0\.') VER1 = re.compile(r'^1\.') STATUS_MAP = { - "green": 0, - "yellow": 1, - "red": 2, + "green": 0, + "yellow": 1, + "red": 2, } class ESError(RuntimeError): - """Exception raised if we don't get a 200 OK from ElasticSearch.""" + """Exception raised if we don't get a 200 OK from ElasticSearch.""" - def __init__(self, resp): - RuntimeError.__init__(self, str(resp)) - self.resp = resp + def __init__(self, resp): + RuntimeError.__init__(self, str(resp)) + self.resp = resp -def request(server, uri, json_in = True): - """Does a GET request of the given uri on the given HTTPConnection.""" - server.request("GET", uri) - resp = server.getresponse() - if resp.status != httplib.OK: - raise ESError(resp) - if json_in: - return json.loads(resp.read()) - else: - return resp.read() +def request(server, uri, json_in=True): + """Does a GET request of the given uri on the given HTTPConnection.""" + server.request("GET", uri) + resp = server.getresponse() + if resp.status != httplib.OK: + raise ESError(resp) + if json_in: + return json.loads(resp.read()) + else: + return resp.read() def cluster_health(server): - return request(server, "/_cluster/health") + return request(server, "/_cluster/health") def cluster_stats(server): - return request(server, "/_cluster/stats") + return request(server, "/_cluster/stats") def cluster_master_node(server): - return request(server, "/_cat/master", json_in = False).split()[0] + return request(server, "/_cat/master", json_in=False).split()[0] def index_stats(server): - return request(server, "/_cat/indices?v&bytes=b", json_in = False) + return request(server, "/_cat/indices?v&bytes=b", json_in=False) def node_status(server): - return request(server, "/") + return request(server, "/") def node_stats(server, version): - # API changed in v1.0 - if PRE_VER1.match(version): - url = "/_cluster/nodes/_local/stats" - # elif VER1.match(version): - # url = "/_nodes/_local/stats" - else: - url = "/_nodes/_local/stats" - return request(server, url) + # API changed in v1.0 + if PRE_VER1.match(version): + url = "/_cluster/nodes/_local/stats" + # elif VER1.match(version): + # url = "/_nodes/_local/stats" + else: + url = "/_nodes/_local/stats" + return request(server, url) + def printmetric(metric, ts, value, tags): - # Warning, this should be called inside a lock - if tags: - tags = " " + " ".join("%s=%s" % (name.replace(" ",""), value.replace(" ","")) - for name, value in tags.iteritems()) - else: - tags = "" - print ("%s %d %s %s" - % (metric, ts, value, tags)) + # Warning, this should be called inside a lock + if tags: + tags = " " + " ".join("%s=%s" % (name.replace(" ", ""), value.replace(" ", "")) + for name, value in tags.items()) + else: + tags = "" + print("%s %d %s %s" + % (metric, ts, value, tags)) + def _traverse(metric, stats, ts, tags): - """ - Recursively traverse the json tree and print out leaf numeric values - Please make sure you call this inside a lock and don't add locking - inside this function - """ - #print metric,stats,ts,tags - if isinstance(stats,dict): - if "timestamp" in stats: - ts = stats["timestamp"] / 1000 # ms -> s - for key in stats.keys(): - if key != "timestamp": - _traverse(metric + "." 
+ key, stats[key], ts, tags) - if isinstance(stats, (list, set, tuple)): - count = 0 - for value in stats: - _traverse(metric + "." + str(count), value, ts, tags) - count += 1 - if utils.is_numeric(stats) and not isinstance(stats, bool): - if isinstance(stats, int): - stats = int(stats) - printmetric(metric, ts, stats, tags) - return + """ + Recursively traverse the json tree and print out leaf numeric values + Please make sure you call this inside a lock and don't add locking + inside this function + """ + # print metric,stats,ts,tags + if isinstance(stats, dict): + if "timestamp" in stats: + ts = stats["timestamp"] / 1000 # ms -> s + for key in list(stats.keys()): + if key != "timestamp": + _traverse(metric + "." + key, stats[key], ts, tags) + if isinstance(stats, (list, set, tuple)): + count = 0 + for value in stats: + _traverse(metric + "." + str(count), value, ts, tags) + count += 1 + if utils.is_numeric(stats) and not isinstance(stats, bool): + if isinstance(stats, int): + stats = int(stats) + printmetric(metric, ts, stats, tags) + return + def _collect_indices(server, metric, tags, lock): - ts = int(time.time()) - rawtable = index_stats(server).split("\n") - header = rawtable.pop(0).strip() - headerlist = [x.strip() for x in header.split()] - for line in rawtable: - # Copy the cluster tag - newtags = {"cluster": tags["cluster"]} - # Now parse each input - values = line.split() - count = 0 - for value in values: - try: - value = float(value) - if int(value) == value: - value = int(value) - # now print value - with lock: - printmetric(metric + ".cluster.byindex." + headerlist[count], ts, value, newtags) - except ValueError, ve: - # add this as a tag - newtags[headerlist[count]] = value - count += 1 + ts = int(time.time()) + rawtable = index_stats(server).split("\n") + header = rawtable.pop(0).strip() + headerlist = [x.strip() for x in header.split()] + for line in rawtable: + # Copy the cluster tag + newtags = {"cluster": tags["cluster"]} + # Now parse each input + values = line.split() + count = 0 + for value in values: + try: + value = float(value) + if int(value) == value: + value = int(value) + # now print value + with lock: + printmetric(metric + ".cluster.byindex." + headerlist[count], ts, value, newtags) + except ValueError as ve: + # add this as a tag + newtags[headerlist[count]] = value + count += 1 + def _collect_master(server, nodeid, metric, tags, lock): - ts = int(time.time()) - chealth = cluster_health(server) - if "status" in chealth: + ts = int(time.time()) + chealth = cluster_health(server) + if "status" in chealth: + with lock: + printmetric(metric + ".cluster.status", ts, + STATUS_MAP.get(chealth["status"], -1), tags) with lock: - printmetric(metric + ".cluster.status", ts, - STATUS_MAP.get(chealth["status"], -1), tags) - with lock: - _traverse(metric + ".cluster", chealth, ts, tags) + _traverse(metric + ".cluster", chealth, ts, tags) - ts = int(time.time()) # In case last call took a while. - cstats = cluster_stats(server) - with lock: - _traverse(metric + ".cluster", cstats, ts, tags) + ts = int(time.time()) # In case last call took a while. 
+ cstats = cluster_stats(server) + with lock: + _traverse(metric + ".cluster", cstats, ts, tags) -def _collect_server(server, version, lock): - ts = int(time.time()) - rootmetric = "elasticsearch" - nstats = node_stats(server, version) - cluster_name = nstats["cluster_name"] - nodeid, nstats = nstats["nodes"].popitem() - node_name = nstats["name"] - tags = {"cluster": cluster_name, "node": node_name} - #tags.update(nstats["attributes"]) - is_master = nodeid == cluster_master_node(server) - with lock: - printmetric(rootmetric + ".is_master", ts, is_master, tags) - if is_master: - _collect_master(server, nodeid, rootmetric, tags, lock) +def _collect_server(server, version, lock): + ts = int(time.time()) + rootmetric = "elasticsearch" + nstats = node_stats(server, version) + cluster_name = nstats["cluster_name"] + nodeid, nstats = nstats["nodes"].popitem() + node_name = nstats["name"] + tags = {"cluster": cluster_name, "node": node_name} + # tags.update(nstats["attributes"]) + + is_master = nodeid == cluster_master_node(server) + with lock: + printmetric(rootmetric + ".is_master", ts, is_master, tags) + if is_master: + _collect_master(server, nodeid, rootmetric, tags, lock) - _collect_indices(server, rootmetric, tags, lock) + _collect_indices(server, rootmetric, tags, lock) - with lock: - _traverse(rootmetric, nstats, ts, tags) + with lock: + _traverse(rootmetric, nstats, ts, tags) def main(argv): - utils.drop_privileges() - socket.setdefaulttimeout(DEFAULT_TIMEOUT) - servers = [] - - if json is None: - utils.err("This collector requires the `json' Python module.") - return 1 - - for conf in elasticsearch_conf.get_servers(): - server = httplib.HTTPConnection( *conf ) - try: - server.connect() - except socket.error, (erno, e): - if erno == errno.ECONNREFUSED: - continue - raise - servers.append( server ) - - if len( servers ) == 0: - return 13 # No ES running, ask tcollector to not respawn us. - - lock = threading.Lock() - while True: - threads = [] - for server in servers: - status = node_status(server) - version = status["version"]["number"] - t = threading.Thread(target = _collect_server, args = (server, version, lock)) - t.start() - threads.append(t) - for thread in threads: - t.join() - time.sleep(COLLECTION_INTERVAL) + utils.drop_privileges() + socket.setdefaulttimeout(DEFAULT_TIMEOUT) + servers = [] + + if json is None: + utils.err("This collector requires the `json' Python module.") + return 1 + + for conf in elasticsearch_conf.get_servers(): + server = httplib.HTTPConnection(*conf) + try: + server.connect() + except socket.error as sock_err: + if sock_err.errno == errno.ECONNREFUSED: + continue + raise + servers.append(server) + + if len(servers) == 0: + return 13 # No ES running, ask tcollector to not respawn us. + + lock = threading.Lock() + while True: + threads = [] + for server in servers: + status = node_status(server) + version = status["version"]["number"] + t = threading.Thread(target=_collect_server, args=(server, version, lock)) + t.start() + threads.append(t) + for thread in threads: + thread.join() + time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) diff --git a/collectors/0/flume.py b/collectors/0/flume.py index 9d023f20..93ef4e98 100755 --- a/collectors/0/flume.py +++ b/collectors/0/flume.py @@ -26,107 +26,117 @@ Based on the elastichsearch collector -""" +""" import errno -import httplib -try: - import json -except ImportError: - json = None # Handled gracefully in main. 
Not available by default in <2.6 import socket import sys import time +is_py2 = sys.version[0] == '2' +if is_py2: + import httplib as httplib +else: + import http.client as httplib + +try: + import json +except ImportError: + json = None # Handled gracefully in main. Not available by default in <2.6 + from collectors.lib import utils try: - from collectors.etc import flume_conf + from collectors.etc import flume_conf except ImportError: - flume_conf = None + flume_conf = None COLLECTION_INTERVAL = 15 # seconds -DEFAULT_TIMEOUT = 10.0 # seconds +DEFAULT_TIMEOUT = 10.0 # seconds FLUME_HOST = "localhost" FLUME_PORT = 34545 # Exclude values that are not really metrics and totally pointless to keep track of -EXCLUDE = [ 'StartTime', 'StopTime', 'Type' ] +EXCLUDE = ['StartTime', 'StopTime', 'Type'] + def err(msg): - print >>sys.stderr, msg + utils.err(msg) + class FlumeError(RuntimeError): - """Exception raised if we don't get a 200 OK from Flume webserver.""" - def __init__(self, resp): - RuntimeError.__init__(self, str(resp)) - self.resp = resp + """Exception raised if we don't get a 200 OK from Flume webserver.""" + + def __init__(self, resp): + RuntimeError.__init__(self, str(resp)) + self.resp = resp + def request(server, uri): - """Does a GET request of the given uri on the given HTTPConnection.""" - server.request("GET", uri) - resp = server.getresponse() - if resp.status != httplib.OK: - raise FlumeError(resp) - return json.loads(resp.read()) + """Does a GET request of the given uri on the given HTTPConnection.""" + server.request("GET", uri) + resp = server.getresponse() + if resp.status != httplib.OK: + raise FlumeError(resp) + return json.loads(resp.read()) def flume_metrics(server): - return request(server, "/metrics") + return request(server, "/metrics") + def main(argv): - if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()): - sys.exit(13) - - settings = flume_conf.get_settings() - - if (settings['default_timeout']): - DEFAULT_TIMEOUT = settings['default_timeout'] - - if (settings['default_timeout']): - COLLECTION_INTERVAL = settings['collection_interval'] - - if (settings['flume_host']): - FLUME_HOST = settings['flume_host'] - - if (settings['flume_port']): - FLUME_PORT = settings['flume_port'] - - utils.drop_privileges() - socket.setdefaulttimeout(DEFAULT_TIMEOUT) - server = httplib.HTTPConnection(FLUME_HOST, FLUME_PORT) - try: - server.connect() - except socket.error, (erno, e): - if erno == errno.ECONNREFUSED: - return 13 # No Flume server available, ask tcollector to not respawn us. - raise - if json is None: - err("This collector requires the `json' Python module.") - return 1 - - def printmetric(metric, value, **tags): - if tags: - tags = " " + " ".join("%s=%s" % (name, value) - for name, value in tags.iteritems()) - else: - tags = "" - print ("flume.%s %d %s %s" % (metric, ts, value, tags)) - - while True: - # Get the metrics - ts = int(time.time()) # In case last call took a while. 
- stats = flume_metrics(server) - - for metric in stats: - (component, name) = metric.split(".") - tags = {component.lower(): name} - for key,value in stats[metric].items(): - if key not in EXCLUDE: - printmetric(key.lower(), value, **tags) - - time.sleep(COLLECTION_INTERVAL) + if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()): + sys.exit(13) + + settings = flume_conf.get_settings() + + if (settings['default_timeout']): + DEFAULT_TIMEOUT = settings['default_timeout'] + + if (settings['default_timeout']): + COLLECTION_INTERVAL = settings['collection_interval'] + + if (settings['flume_host']): + FLUME_HOST = settings['flume_host'] + + if (settings['flume_port']): + FLUME_PORT = settings['flume_port'] + + utils.drop_privileges() + socket.setdefaulttimeout(DEFAULT_TIMEOUT) + server = httplib.HTTPConnection(FLUME_HOST, FLUME_PORT) + try: + server.connect() + except socket.error as sock_err: + if sock_err.errno == errno.ECONNREFUSED: + return 13 # No Flume server available, ask tcollector to not respawn us. + raise + if json is None: + err("This collector requires the `json' Python module.") + return 1 + + def printmetric(metric, value, **tags): + if tags: + tags = " " + " ".join("%s=%s" % (name, value) for name, value in tags.items()) + else: + tags = "" + print("flume.%s %d %s %s" % (metric, ts, value, tags)) + + while True: + # Get the metrics + ts = int(time.time()) # In case last call took a while. + stats = flume_metrics(server) + + for metric in stats: + (component, name) = metric.split(".") + tags = {component.lower(): name} + for key, value in stats[metric].items(): + if key not in EXCLUDE: + printmetric(key.lower(), value, **tags) + + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) diff --git a/collectors/0/g1gc.py b/collectors/0/g1gc.py index 1ff4386b..1457289e 100755 --- a/collectors/0/g1gc.py +++ b/collectors/0/g1gc.py @@ -93,8 +93,9 @@ FLOAT_NUMBER_PATTERN = '\d+(?:\.\d+)?' pattern_map = { - GC_START_TIME_PATTERN: re.compile('^(20\d\d)\-([01]\d)\-([0123]\d)T([012]\d):([012345]\d):([012345]\d).\d\d\d([+-]\d\d\d\d):\s*%s:\s*\[\s*(.+)' % - FLOAT_NUMBER_PATTERN), + GC_START_TIME_PATTERN: re.compile( + '^(20\d\d)\-([01]\d)\-([0123]\d)T([012]\d):([012345]\d):([012345]\d).\d\d\d([+-]\d\d\d\d):\s*%s:\s*\[\s*(.+)' % + FLOAT_NUMBER_PATTERN), # [Parallel Time: 157.1 ms] # Parallel Time is the total elapsed time spent by all the parallel GC worker threads. The following lines correspond to the parallel tasks performed by these worker threads in this total parallel time, which in this case is 157.1 ms. 
PARALLEL_TIME_PATTERN: re.compile('\s*\[Parallel Time:\s*(%s) ms,\s*GC Workers:\s*(\d+)\]\s*' % @@ -106,10 +107,15 @@ # [Times: user=0.52 sys=0.01, real=0.05 secs] REMARK_PATTERN: re.compile('GC remark.*\[GC ref-proc,\s*(\d+\.\d+)\s*secs\],\s*(\d+\.\d+)\s*secs\]$'), # [Scan RS (ms): Min: 0.0, Avg: 0.1, Max: 0.2, Diff: 0.2, Sum: 1.7] - SCAN_RS_PATTERN: re.compile('\s*\[Scan RS \(ms\): Min: (%s), Avg: (%s), Max: (%s)' % (FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN)), - OBJECT_COPY_PATTERN: re.compile('\s*\[Object Copy \(ms\): Min: (%s), Avg: (%s), Max: (%s)' % (FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN)), + SCAN_RS_PATTERN: re.compile('\s*\[Scan RS \(ms\): Min: (%s), Avg: (%s), Max: (%s)' % ( + FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN)), + OBJECT_COPY_PATTERN: re.compile('\s*\[Object Copy \(ms\): Min: (%s), Avg: (%s), Max: (%s)' % ( + FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN)), # [Eden: 3584.0M(3584.0M)->0.0B(3584.0M) Survivors: 512.0M->512.0M Heap: 91.2G(100.8G)->87.9G(100.8G)] - ALLOCATION_PATTERN: re.compile('^\s*\[Eden: (%s)([BMG])\(\d+(?:\.\d+)?[MG]\)\->(%s)([BMG])\((%s)([BMG])\) Survivors: (%s)([BMG])\->(%s)([BMG]).+Heap: (%s)G\((%s)G\)\->(%s)G\((%s)G\).+' % (FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN)), + ALLOCATION_PATTERN: re.compile( + '^\s*\[Eden: (%s)([BMG])\(\d+(?:\.\d+)?[MG]\)\->(%s)([BMG])\((%s)([BMG])\) Survivors: (%s)([BMG])\->(%s)([BMG]).+Heap: (%s)G\((%s)G\)\->(%s)G\((%s)G\).+' % ( + FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, + FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN, FLOAT_NUMBER_PATTERN)), # [Free CSet: 1.1 ms] # Time spent in freeing the collection set data structure. 
FREE_CSET_PATTERN: re.compile('^\s*\[Free CSet: (%s) ms\]$' % FLOAT_NUMBER_PATTERN), @@ -128,11 +134,13 @@ } + # Utilities def get_file_end(file_handler): file_handler.seek(0, 2) return file_handler.tell() + def get_latest_gc_log(log_dir, log_name_pattern): sorted_gc_logs = sorted(glob.glob(os.path.join(log_dir, log_name_pattern))) if len(sorted_gc_logs) == 0: @@ -140,15 +148,21 @@ def get_latest_gc_log(log_dir, log_name_pattern): log_dir + '" with pattern: "' + log_name_pattern + '"') return sorted_gc_logs[-1] + def true_unix_timestamp(year, month, day, hour, minute, second, timezone): d = datetime(year, month, day, hour, minute, second) - timedelta(seconds=36 * timezone) return calendar.timegm(d.utctimetuple()) + def to_size_in_mb(data_size, unit): '''Convert size in given unit: GB or B to size in MB ''' - if unit == 'G': return data_size * 1024 - elif unit == 'B': return data_size / (1024 * 1024.0) - else: return data_size + if unit == 'G': + return data_size * 1024 + elif unit == 'B': + return data_size / (1024 * 1024.0) + else: + return data_size + def match_pattern(line): for pattern_name, pattern in pattern_map.items(): @@ -156,6 +170,7 @@ def match_pattern(line): if m: return (pattern_name, m) return (None, None) + def match_until(file_handler, pattern): while True: line = file_handler.readline() @@ -164,16 +179,19 @@ def match_until(file_handler, pattern): if m: return m return None + def sec2milli(seconds): return 1000 * seconds + def flush_collector(collector): for metric_name, value in collector['data'].items(): - print metric_name % (collector['timestamp'], value) + print(metric_name % (collector['timestamp'], value)) collector['timestamp'] = None collector['data'] = {} + def collect_metric(metric_name, timestamp, value, collector): if collector['timestamp'] != timestamp: flush_collector(collector) @@ -181,6 +199,7 @@ def collect_metric(metric_name, timestamp, value, collector): collector['timestamp'] = timestamp collector['data'][metric_name] = collector['data'].get(metric_name, 0) + value + def collect_metric_with_prefix(prefix, metric_name, timestamp, value, collector): new_metric_name = metric_name p = '' if prefix is None else prefix.strip() @@ -188,38 +207,48 @@ def collect_metric_with_prefix(prefix, metric_name, timestamp, value, collector) new_metric_name = '.'.join([p, metric_name]) collect_metric(new_metric_name, timestamp, value, collector) + def unmatched_gc_log(line): pass + # Simple gc events, don't have inner gc events def concurrent_cleanup_handler(prefix, log_line, timestamp, collector, file_handler): concurrent_clean_up_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.concurrent_cleanup %s %s", timestamp, concurrent_clean_up_time, collector) + def concurrent_mark_handler(prefix, log_line, timestamp, collector, file_handler): concurrent_mark_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.concurrent_mark %s %s", timestamp, concurrent_mark_time, collector) + def concurrent_root_region_scan_handler(prefix, log_line, timestamp, collector, file_handler): concurrent_root_region_scan_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) - collect_metric_with_prefix(prefix, "gc.g1.concurrent_root_region_scan %s %s", timestamp, concurrent_root_region_scan_time, collector) + collect_metric_with_prefix(prefix, "gc.g1.concurrent_root_region_scan %s %s", timestamp, + concurrent_root_region_scan_time, 
collector) + def cleanup_handler(prefix, log_line, timestamp, collector, file_handler): clean_up_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=cleanup", timestamp, clean_up_time, collector) + def fullgc_handler(prefix, log_line, timestamp, collector, file_handler): full_gc_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.fullgc.duration %s %s", timestamp, full_gc_time, collector) + # Inner gc events, which we should have a matcher object def parallel_time_handler(prefix, matcher, timestamp, collector, file_handler): parallel_time, num_of_gc_workers = float(matcher.group(1)), float(matcher.group(2)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=parallel-time", timestamp, parallel_time, collector) + def object_copy_handler(prefix, matcher, timestamp, collector, file_handler): min_time, avg_time, max_time = [float(matcher.group(i)) for i in range(1, 4)] collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=object-copy", timestamp, avg_time, collector) + def allocation_handler(prefix, matcher, timestamp, collector, file_handler): eden_before_in_size, eden_after_in_size = matcher.group(2), matcher.group(4) eden_before = to_size_in_mb(float(matcher.group(1)), eden_before_in_size) @@ -237,87 +266,118 @@ def allocation_handler(prefix, matcher, timestamp, collector, file_handler): heap_after_in_mb = to_size_in_mb(heap_after, 'G') collect_metric_with_prefix(prefix, "gc.g1.allocation %s %s", timestamp, eden_before - eden_after, collector) - collect_metric_with_prefix(prefix, "gc.g1.promotion %s %s", timestamp, (eden_before - eden_after) - (heap_before - heap_after), collector) - collect_metric_with_prefix(prefix, "gc.g1.heap_ratio.before %s %s", timestamp, heap_before / heap_total_size_before, collector) - collect_metric_with_prefix(prefix, "gc.g1.heap_ratio.after %s %s", timestamp, heap_after / heap_total_size_after, collector) + collect_metric_with_prefix(prefix, "gc.g1.promotion %s %s", timestamp, + (eden_before - eden_after) - (heap_before - heap_after), collector) + collect_metric_with_prefix(prefix, "gc.g1.heap_ratio.before %s %s", timestamp, heap_before / heap_total_size_before, + collector) + collect_metric_with_prefix(prefix, "gc.g1.heap_ratio.after %s %s", timestamp, heap_after / heap_total_size_after, + collector) collector['gensize']['eden'] = eden_capacity_after collector['gensize']['survivor'] = survivor_after collector['gensize']['heap'] = heap_after_in_mb + def free_cset_handler(prefix, matcher, timestamp, collector, file_handler): free_cset_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=free-cset", timestamp, free_cset_time, collector) + def ref_enq_handler(prefix, matcher, timestamp, collector, file_handler): ref_enq_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=ref-enq", timestamp, ref_enq_time, collector) + def ref_proc_handler(prefix, matcher, timestamp, collector, file_handler): ref_proc_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=ref-proc", timestamp, ref_proc_time, collector) + def choose_cset_handler(prefix, matcher, timestamp, collector, file_handler): choose_cset_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=choose-cset", timestamp, choose_cset_time, collector) + def clear_ct_handler(prefix, 
matcher, timestamp, collector, file_handler): clear_ct_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=clear-ct", timestamp, clear_ct_time, collector) + def scan_rs_handler(prefix, matcher, timestamp, collector, file_handler): min_time, avg_time, max_time = [float(matcher.group(i)) for i in range(1, 4)] collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=scan-rs", timestamp, avg_time, collector) + # Complex GC events: initial-mark, young-pause, mixed-pause and remark # These GC events contains several inner gc events and we must call match_remaining_log to parse remaining gc events def initial_mark_handler(prefix, log_line, timestamp, collector, file_handler): m = match_until(file_handler, pattern_map[GC_PAUSE_PATTERN]) initial_mark_pause_time = sec2milli(float(m.group(1))) - collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=initial-mark", timestamp, initial_mark_pause_time, collector) + collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=initial-mark", timestamp, initial_mark_pause_time, + collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def young_pause_handler(prefix, log_line, timestamp, collector, file_handler): m = match_until(file_handler, pattern_map[GC_PAUSE_PATTERN]) young_pause_time = sec2milli(float(m.group(1))) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=young-pause", timestamp, young_pause_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def mixed_pause_handler(prefix, log_line, timestamp, collector, file_handler): m = match_until(file_handler, pattern_map[GC_PAUSE_PATTERN]) mixed_pause_time = sec2milli(float(m.group(1))) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=mixed-pause", timestamp, mixed_pause_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def remark_handler(prefix, log_line, timestamp, collector, file_handler): - m = pattern_map[REMARK_PATTERN].match(log_line) + m = pattern_map[REMARK_PATTERN].match(log_line) remark_time, ref_process_time = [sec2milli(float(m.group(i))) for i in range(1, 3)] collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=remark", timestamp, remark_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def match_remaining_log(prefix, timestamp, collector, file_handler): while True: line = file_handler.readline() if len(line) == 0: break pattern_name, matcher = match_pattern(line) - if pattern_name == GC_END_TIME_PATTERN: break - elif pattern_name == PARALLEL_TIME_PATTERN: parallel_time_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == SCAN_RS_PATTERN: scan_rs_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == OBJECT_COPY_PATTERN: object_copy_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == ALLOCATION_PATTERN: allocation_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == FREE_CSET_PATTERN: free_cset_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == REF_ENQ_PATTERN: ref_enq_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == REF_PROC_PATTERN: ref_proc_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == CHOOSE_CSET_PATTERN: choose_cset_handler(prefix, matcher, timestamp, collector, file_handler) - elif pattern_name == CLEAR_CT_PATTERN: clear_ct_handler(prefix, matcher, 
timestamp, collector, file_handler) - else: unmatched_gc_log(line) + if pattern_name == GC_END_TIME_PATTERN: + break + elif pattern_name == PARALLEL_TIME_PATTERN: + parallel_time_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == SCAN_RS_PATTERN: + scan_rs_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == OBJECT_COPY_PATTERN: + object_copy_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == ALLOCATION_PATTERN: + allocation_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == FREE_CSET_PATTERN: + free_cset_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == REF_ENQ_PATTERN: + ref_enq_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == REF_PROC_PATTERN: + ref_proc_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == CHOOSE_CSET_PATTERN: + choose_cset_handler(prefix, matcher, timestamp, collector, file_handler) + elif pattern_name == CLEAR_CT_PATTERN: + clear_ct_handler(prefix, matcher, timestamp, collector, file_handler) + else: + unmatched_gc_log(line) + def process_gc_record(prefix, file_handler, timestamp, cause, collector): # process simple gc events - if 'concurrent-cleanup-end' in cause: concurrent_cleanup_handler(prefix, cause, timestamp, collector, file_handler) - elif 'concurrent-mark-end' in cause: concurrent_mark_handler(prefix, cause, timestamp, collector, file_handler) - elif 'concurrent-root-region-scan-end' in cause: concurrent_root_region_scan_handler(prefix, cause, timestamp, collector, file_handler) - elif 'GC cleanup' in cause: cleanup_handler(prefix, cause, timestamp, collector, file_handler) + if 'concurrent-cleanup-end' in cause: + concurrent_cleanup_handler(prefix, cause, timestamp, collector, file_handler) + elif 'concurrent-mark-end' in cause: + concurrent_mark_handler(prefix, cause, timestamp, collector, file_handler) + elif 'concurrent-root-region-scan-end' in cause: + concurrent_root_region_scan_handler(prefix, cause, timestamp, collector, file_handler) + elif 'GC cleanup' in cause: + cleanup_handler(prefix, cause, timestamp, collector, file_handler) elif 'Full GC' in cause: collector['count']['fullgc'] += 1 fullgc_handler(prefix, cause, timestamp, collector, file_handler) @@ -335,10 +395,11 @@ def process_gc_record(prefix, file_handler, timestamp, cause, collector): elif 'remark' in cause: collector['count']['remark'] += 1 remark_handler(prefix, cause, timestamp, collector, file_handler) - elif cause[-1] == ']': return + elif cause[-1] == ']': + return -def process_gc_log(collector): +def process_gc_log(collector): prefix = collector['prefix'] # get latest gc log to process gc_log = get_latest_gc_log(collector['log_dir'], collector['log_name_pattern']) @@ -377,11 +438,11 @@ def process_gc_log(collector): if not collector['timestamp'] is None: for gen, value in collector['gensize'].items(): - print "%s.gc.g1.gensize %s %s gen=%s" % (prefix, current_timestamp_in_sec, value, gen) + print("%s.gc.g1.gensize %s %s gen=%s" % (prefix, current_timestamp_in_sec, value, gen)) # publish gc event count metrics for event, value in collector['count'].items(): - print "%s.gc.g1.event.count %s %s event=%s" % (prefix, current_timestamp_in_sec, value, event) + print("%s.gc.g1.event.count %s %s event=%s" % (prefix, current_timestamp_in_sec, value, event)) except Exception: exc_type, exc_value, exc_traceback = sys.exc_info() @@ -390,8 +451,8 @@ def 
process_gc_log(collector): return 0 -def main(): +def main(): interval = g1gc_conf.get_interval() config = g1gc_conf.get_gc_config() counters = {'young': 0, 'mixed': 0, 'initialmark': 0, @@ -412,5 +473,6 @@ def main(): sys.stdout.flush() time.sleep(interval) + if __name__ == '__main__': exit(main()) diff --git a/collectors/0/graphite_bridge.py b/collectors/0/graphite_bridge.py index 9b74cee7..8b244d46 100755 --- a/collectors/0/graphite_bridge.py +++ b/collectors/0/graphite_bridge.py @@ -13,38 +13,43 @@ # see . """Listens on a local TCP socket for incoming metrics in the graphite protocol.""" -from __future__ import print_function - import sys -from collectors.lib import utils -import SocketServer import threading +is_py2 = sys.version[0] == '2' +if is_py2: + import SocketServer as socketserver +else: + import socketserver as socketserver + +from collectors.lib import utils + try: - from collectors.etc import graphite_bridge_conf + from collectors.etc import graphite_bridge_conf except ImportError: - graphite_bridge_conf = None + graphite_bridge_conf = None HOST = '127.0.0.1' PORT = 2003 SIZE = 8192 -class GraphiteServer(SocketServer.ThreadingTCPServer): + +class GraphiteServer(socketserver.ThreadingTCPServer): allow_reuse_address = True print_lock = threading.Lock() -class GraphiteHandler(SocketServer.BaseRequestHandler): + +class GraphiteHandler(socketserver.BaseRequestHandler): def handle_line(self, line): line_parts = line.split() with self.server.print_lock: if len(line_parts) != 3: - print("Bad data:", line, file=sys.stderr) + utils.err("Bad data: %s" % line) else: print(line_parts[0], line_parts[2], line_parts[1]) - def handle(self): data = '' while True: @@ -65,7 +70,7 @@ def handle(self): def main(): if not (graphite_bridge_conf and graphite_bridge_conf.enabled()): - sys.exit(13) + sys.exit(13) utils.drop_privileges() server = GraphiteServer((HOST, PORT), GraphiteHandler) @@ -76,6 +81,7 @@ def main(): server.shutdown() server.server_close() + if __name__ == "__main__": main() diff --git a/collectors/0/gstat.py b/collectors/0/gstat.py index dc881275..66806ffc 100755 --- a/collectors/0/gstat.py +++ b/collectors/0/gstat.py @@ -58,22 +58,25 @@ except ImportError: gstat_conf = None -DEFAULT_COLLECTION_INTERVAL=15 +DEFAULT_COLLECTION_INTERVAL = 15 signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """top main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - collection_filter=".*" - if(gstat_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + collection_filter = ".*" + if (gstat_conf): config = gstat_conf.get_config() - collection_interval=config['collection_interval'] - collection_filter=config['collection_filter'] + collection_interval = config['collection_interval'] + collection_filter = config['collection_filter'] global signal_received @@ -82,13 +85,13 @@ def main(): try: p_gstat = subprocess.Popen( - ["gstat", "-B", "-d", "-o", "-s", "-I"+str(collection_interval)+"s", "-f"+str(collection_filter)], + ["gstat", "-B", "-d", "-o", "-s", "-I" + str(collection_interval) + "s", "-f" + str(collection_filter)], stdout=subprocess.PIPE, ) - except OSError, e: + except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise timestamp = 0 @@ -96,7 +99,7 @@ def main(): while signal_received is None: try: line = p_gstat.stdout.readline() - except (IOError, OSError), 
e: + except (IOError, OSError) as e: if e.errno in (errno.EINTR, errno.EAGAIN): break raise @@ -105,31 +108,31 @@ def main(): # end of the program, die break - if (not re.match("^ *[0-9]",line)): + if (not re.match("^ *[0-9]", line)): timestamp = int(time.time()) continue fields = line.split() - print "disk.queue %s %s disk=%s" % (timestamp, fields[0], fields[17]) - print "disk.ops.read %s %s disk=%s" % (timestamp, fields[2], fields[17]) - print "disk.b.read %s %d disk=%s" % (timestamp, float(fields[3])*1024, fields[17]) - print "disk.bps.read %s %d disk=%s" % (timestamp, float(fields[4])*1024, fields[17]) - print "disk.ms.read %s %s disk=%s" % (timestamp, float(fields[5]), fields[17]) - print "disk.ops.write %s %s disk=%s" % (timestamp, fields[6], fields[17]) - print "disk.b.write %s %d disk=%s" % (timestamp, float(fields[7])*1024, fields[17]) - print "disk.bps.write %s %d disk=%s" % (timestamp, float(fields[8])*1024, fields[17]) - print "disk.ms.write %s %s disk=%s" % (timestamp, float(fields[9]), fields[17]) - print "disk.ops.delete %s %s disk=%s" % (timestamp, fields[10], fields[17]) - print "disk.b.delete %s %d disk=%s" % (timestamp, float(fields[11])*1024, fields[17]) - print "disk.bps.delete %s %d disk=%s" % (timestamp, float(fields[12])*1024, fields[17]) - print "disk.ms.delete %s %s disk=%s" % (timestamp, float(fields[13]), fields[17]) - print "disk.ops.other %s %s disk=%s" % (timestamp, fields[14], fields[17]) - print "disk.ms.other %s %s disk=%s" % (timestamp, float(fields[15]), fields[17]) - print "disk.busy %s %s disk=%s" % (timestamp, fields[16], fields[17]) + print("disk.queue %s %s disk=%s" % (timestamp, fields[0], fields[17])) + print("disk.ops.read %s %s disk=%s" % (timestamp, fields[2], fields[17])) + print("disk.b.read %s %d disk=%s" % (timestamp, float(fields[3]) * 1024, fields[17])) + print("disk.bps.read %s %d disk=%s" % (timestamp, float(fields[4]) * 1024, fields[17])) + print("disk.ms.read %s %s disk=%s" % (timestamp, float(fields[5]), fields[17])) + print("disk.ops.write %s %s disk=%s" % (timestamp, fields[6], fields[17])) + print("disk.b.write %s %d disk=%s" % (timestamp, float(fields[7]) * 1024, fields[17])) + print("disk.bps.write %s %d disk=%s" % (timestamp, float(fields[8]) * 1024, fields[17])) + print("disk.ms.write %s %s disk=%s" % (timestamp, float(fields[9]), fields[17])) + print("disk.ops.delete %s %s disk=%s" % (timestamp, fields[10], fields[17])) + print("disk.b.delete %s %d disk=%s" % (timestamp, float(fields[11]) * 1024, fields[17])) + print("disk.bps.delete %s %d disk=%s" % (timestamp, float(fields[12]) * 1024, fields[17])) + print("disk.ms.delete %s %s disk=%s" % (timestamp, float(fields[13]), fields[17])) + print("disk.ops.other %s %s disk=%s" % (timestamp, fields[14], fields[17])) + print("disk.ms.other %s %s disk=%s" % (timestamp, float(fields[15]), fields[17])) + print("disk.busy %s %s disk=%s" % (timestamp, fields[16], fields[17])) sys.stdout.flush() - + if signal_received is None: signal_received = signal.SIGTERM try: @@ -138,5 +141,6 @@ def main(): pass p_gstat.wait() + if __name__ == "__main__": main() diff --git a/collectors/0/hadoop_datanode.py b/collectors/0/hadoop_datanode.py index e50c122c..32abea73 100755 --- a/collectors/0/hadoop_datanode.py +++ b/collectors/0/hadoop_datanode.py @@ -23,7 +23,6 @@ from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp - REPLACEMENTS = { "datanodeactivity-": ["activity"], "fsdatasetstate-ds-": ["fs_data_set_state"], @@ -47,7 +46,7 @@ def emit(self): current_time = 
int(time.time()) metrics = self.poll() for context, metric_name, value in metrics: - for k, v in REPLACEMENTS.iteritems(): + for k, v in REPLACEMENTS.items(): if any(c.startswith(k) for c in context): context = v self.emit_metric(context, current_time, metric_name, value) @@ -67,4 +66,3 @@ def main(args): if __name__ == "__main__": sys.exit(main(sys.argv)) - diff --git a/collectors/0/hadoop_namenode.py b/collectors/0/hadoop_namenode.py index 5168e3fd..f1af39c4 100755 --- a/collectors/0/hadoop_namenode.py +++ b/collectors/0/hadoop_namenode.py @@ -23,7 +23,6 @@ from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp - REPLACEMENTS = { "rpcdetailedactivityforport": ["rpc_activity"], "rpcactivityforport": ["rpc_activity"] @@ -45,7 +44,7 @@ def emit(self): current_time = int(time.time()) metrics = self.poll() for context, metric_name, value in metrics: - for k, v in REPLACEMENTS.iteritems(): + for k, v in REPLACEMENTS.items(): if any(c.startswith(k) for c in context): context = v self.emit_metric(context, current_time, metric_name, value) @@ -65,4 +64,3 @@ def main(args): if __name__ == "__main__": sys.exit(main(sys.argv)) - diff --git a/collectors/0/haproxy.py b/collectors/0/haproxy.py index 03234c5b..39e48d5c 100755 --- a/collectors/0/haproxy.py +++ b/collectors/0/haproxy.py @@ -90,50 +90,53 @@ "srv_abrt": "server_aborted_data_transfers" } + def haproxy_pid(): - """Finds out the pid of haproxy process""" - try: - pid = subprocess.check_output(["pidof", "haproxy"]) - except subprocess.CalledProcessError: - return None - return pid.rstrip() + """Finds out the pid of haproxy process""" + try: + pid = subprocess.check_output(["pidof", "haproxy"]) + except subprocess.CalledProcessError: + return None + return pid.rstrip() + def find_conf_file(pid): - """Returns the conf file of haproxy.""" - try: - output = subprocess.check_output(["ps", "--no-headers", "-o", "cmd", pid]) - except subprocess.CalledProcessError, e: - utils.err("HAProxy (pid %s) went away? %s" % (pid, e)) - return None - return output.split("-f")[1].split()[0] + """Returns the conf file of haproxy.""" + try: + output = subprocess.check_output(["ps", "--no-headers", "-o", "cmd", pid]) + except subprocess.CalledProcessError as e: + utils.err("HAProxy (pid %s) went away? %s" % (pid, e)) + return None + return output.split(b"-f")[1].split()[0] + def find_sock_file(conf_file): - """Returns the unix socket file of haproxy.""" - try: - fd = open(conf_file) - except IOError, e: - utils.err("Error: %s. Config file path is relative: %s" % (e, conf_file)) - return None - try: - for line in fd: - if line.lstrip(" \t").startswith("stats socket"): - sock_file = line.split()[2] - if utils.is_sockfile(sock_file): - return sock_file - finally: - fd.close() + """Returns the unix socket file of haproxy.""" + try: + fd = open(conf_file) + except IOError as e: + utils.err("Error: %s. 
Config file path is relative: %s" % (e, conf_file)) + return None + try: + for line in fd: + if line.lstrip(" \t").startswith("stats socket"): + sock_file = line.split()[2] + if utils.is_sockfile(sock_file): + return sock_file + finally: + fd.close() def collect_stats(sock_file): """Collects stats from haproxy unix domain socket""" - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) try: - sock.settimeout(COLLECTION_INTERVAL) - sock.connect(sock_file) - sock.send("show stat\n") - statlines = sock.recv(10240).split('\n') + sock.settimeout(COLLECTION_INTERVAL) + sock.connect(sock_file) + sock.send("show stat\n") + statlines = sock.recv(10240).split('\n') finally: - sock.close() + sock.close() ts = time.time() # eat up any empty lines that may be present @@ -182,7 +185,7 @@ def print_metric(line, metric, timestamp): value = line[metric] if not value: value = 0 - print ("haproxy.%s %i %s source=%s cluster=%s" + print("haproxy.%s %i %s source=%s cluster=%s" % (METRIC_NAMES[metric], timestamp, value, @@ -191,24 +194,24 @@ def print_metric(line, metric, timestamp): def main(): - pid = haproxy_pid() - if not pid: - utils.err("Error: HAProxy is not running") - return 13 # Ask tcollector to not respawn us. + pid = haproxy_pid() + if not pid: + utils.err("Error: HAProxy is not running") + return 13 # Ask tcollector to not respawn us. - conf_file = find_conf_file(pid) - if not conf_file: - return 13 + conf_file = find_conf_file(pid) + if not conf_file: + return 13 - sock_file = find_sock_file(conf_file) - if sock_file is None: - utils.err("Error: HAProxy is not listening on any unix domain socket") - return 13 + sock_file = find_sock_file(conf_file) + if sock_file is None: + utils.err("Error: HAProxy is not listening on any unix domain socket") + return 13 + while True: + collect_stats(sock_file) + time.sleep(COLLECTION_INTERVAL) - while True: - collect_stats(sock_file) - time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/hbase_regionserver.py b/collectors/0/hbase_regionserver.py index 8cf3ef5a..0aaa3e0f 100755 --- a/collectors/0/hbase_regionserver.py +++ b/collectors/0/hbase_regionserver.py @@ -28,12 +28,13 @@ EXCLUDED_CONTEXTS = ("master") REGION_METRIC_PATTERN = re.compile(r"[N|n]amespace_(.*)_table_(.*)_region_(.*)_metric_(.*)") + class HBaseRegionserver(HadoopHttp): def __init__(self): super(HBaseRegionserver, self).__init__("hbase", "regionserver", "localhost", 60030) def emit_region_metric(self, context, current_time, full_metric_name, value): - match = REGION_METRIC_PATTERN.match(full_metric_name) + match = REGION_METRIC_PATTERN.match(full_metric_name) if not match: utils.err("Error splitting %s" % full_metric_name) return @@ -44,7 +45,7 @@ def emit_region_metric(self, context, current_time, full_metric_name, value): metric_name = match.group(4) tag_dict = {"namespace": namespace, "table": table, "region": region} - if any( not v for k,v in tag_dict.iteritems()): + if any(not v for k, v in tag_dict.items()): utils.err("Error splitting %s" % full_metric_name) else: self.emit_metric(context, current_time, metric_name, value, tag_dict) @@ -58,7 +59,7 @@ def emit(self): current_time = int(time.time()) metrics = self.poll() for context, metric_name, value in metrics: - if any( c in EXCLUDED_CONTEXTS for c in context): + if any(c in EXCLUDED_CONTEXTS for c in context): continue if any(c == "regions" for c in context): @@ -82,5 +83,5 @@ def main(args): if __name__ 
== "__main__": import sys - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) diff --git a/collectors/0/ifrate.py b/collectors/0/ifrate.py index f41d6bc4..1b493700 100755 --- a/collectors/0/ifrate.py +++ b/collectors/0/ifrate.py @@ -48,23 +48,26 @@ except ImportError: ifrate_conf = None -DEFAULT_COLLECTION_INTERVAL=15 +DEFAULT_COLLECTION_INTERVAL = 15 signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """top main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - if(ifrate_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + if (ifrate_conf): config = ifrate_conf.get_config() - collection_interval=config['collection_interval'] - interfaces=config['interfaces'] - report_packets=config['report_packets'] - merge_err_in_out=config['merge_err_in_out'] + collection_interval = config['collection_interval'] + interfaces = config['interfaces'] + report_packets = config['report_packets'] + merge_err_in_out = config['merge_err_in_out'] global signal_received @@ -80,13 +83,13 @@ def main(): ["netstat", "-I", intname, "-d", "-w", str(collection_interval)], stdout=subprocess.PIPE, )) - intnum+=1 + intnum += 1 else: - sys.exit(13) # we signal tcollector to not run us - except OSError, e: + sys.exit(13) # we signal tcollector to not run us + except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise timestamp = 0 @@ -94,10 +97,10 @@ def main(): while signal_received is None: if (procnum >= intnum): - procnum=0 + procnum = 0 try: line = p_net[procnum].stdout.readline() - except (IOError, OSError), e: + except (IOError, OSError) as e: if e.errno in (errno.EINTR, errno.EAGAIN): break raise @@ -106,28 +109,28 @@ def main(): # end of the program, die break - if (re.match("^[0-9 ]+$",line)): + if (re.match("^[0-9 ]+$", line)): fields = line.split() if len(fields) == 9: - if(procnum == 0): + if (procnum == 0): timestamp = int(time.time()) - print ("ifrate.byt.in %s %s int=%s" % (timestamp, int(fields[3])/collection_interval, interfaces[procnum])) - print ("ifrate.byt.out %s %s int=%s" % (timestamp, int(fields[6])/collection_interval, interfaces[procnum])) - if(report_packets): - print ("ifrate.pkt.in %s %s int=%s" % (timestamp, int(fields[0])/collection_interval, interfaces[procnum])) - print ("ifrate.pkt.out %s %s int=%s" % (timestamp, int(fields[4])/collection_interval, interfaces[procnum])) - if(merge_err_in_out): - print ("ifrate.err %s %s int=%s" % (timestamp, (int(fields[1])+int(fields[5]))/collection_interval, interfaces[procnum])) - print ("ifrate.drp %s %s int=%s" % (timestamp, (int(fields[2])+int(fields[8]))/collection_interval, interfaces[procnum])) + print("ifrate.byt.in %s %s int=%s" % (timestamp, int(fields[3])/collection_interval, interfaces[procnum])) + print("ifrate.byt.out %s %s int=%s" % (timestamp, int(fields[6])/collection_interval, interfaces[procnum])) + if (report_packets): + print("ifrate.pkt.in %s %s int=%s" % (timestamp, int(fields[0])/collection_interval, interfaces[procnum])) + print("ifrate.pkt.out %s %s int=%s" % (timestamp, int(fields[4])/collection_interval, interfaces[procnum])) + if (merge_err_in_out): + print("ifrate.err %s %s int=%s" % (timestamp, (int(fields[1])+int(fields[5]))/collection_interval, interfaces[procnum])) + print("ifrate.drp %s %s int=%s" % (timestamp, (int(fields[2])+int(fields[8]))/collection_interval, interfaces[procnum])) 
else: - print ("ifrate.err.in %s %s int=%s" % (timestamp, int(fields[1])/collection_interval, interfaces[procnum])) - print ("ifrate.drp.in %s %s int=%s" % (timestamp, int(fields[2])/collection_interval, interfaces[procnum])) - print ("ifrate.err.out %s %s int=%s" % (timestamp, int(fields[5])/collection_interval, interfaces[procnum])) - print ("ifrate.drp.out %s %s int=%s" % (timestamp, int(fields[8])/collection_interval, interfaces[procnum])) - print ("ifrate.col %s %s int=%s" % (timestamp, int(fields[7])/collection_interval, interfaces[procnum])) + print("ifrate.err.in %s %s int=%s" % (timestamp, int(fields[1])/collection_interval, interfaces[procnum])) + print("ifrate.drp.in %s %s int=%s" % (timestamp, int(fields[2])/collection_interval, interfaces[procnum])) + print("ifrate.err.out %s %s int=%s" % (timestamp, int(fields[5])/collection_interval, interfaces[procnum])) + print("ifrate.drp.out %s %s int=%s" % (timestamp, int(fields[8])/collection_interval, interfaces[procnum])) + print("ifrate.col %s %s int=%s" % (timestamp, int(fields[7])/collection_interval, interfaces[procnum])) # analyze next process - procnum+=1 + procnum += 1 sys.stdout.flush() @@ -142,8 +145,9 @@ def main(): p_net[procnum].wait() # If no line at all has been proceeded (wrong interface name ?), we signal tcollector to not run us - if(timestamp == 0): + if (timestamp == 0): exit(13) + if __name__ == "__main__": main() diff --git a/collectors/0/ifstat.py b/collectors/0/ifstat.py index 31c0658a..d7df0693 100755 --- a/collectors/0/ifstat.py +++ b/collectors/0/ifstat.py @@ -45,6 +45,7 @@ METRIC_MAPPING = yaml_conf.load_collector_configuration('node_metrics.yml') + def main(): """ifstat main loop""" @@ -87,14 +88,16 @@ def direction(i): if i >= 8: return "out" return "in" - for i in xrange(16): + + for i in range(16): print("proc.net.%s %d %s iface=%s direction=%s" - % (FIELDS[i], ts, stats[i], intf, direction(i))) + % (FIELDS[i], ts, stats[i], intf, direction(i))) metric_naming.print_if_apptuit_standard_metric("proc.net." + FIELDS[i], METRIC_MAPPING, ts, stats[i], {"iface": intf, "direction": direction(i)}) sys.stdout.flush() time.sleep(interval) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/iostat.py b/collectors/0/iostat.py index 73a54498..fbfb4987 100755 --- a/collectors/0/iostat.py +++ b/collectors/0/iostat.py @@ -79,16 +79,16 @@ # Docs come from the Linux kernel's Documentation/iostats.txt FIELDS_DISK = ( - "read_requests", # Total number of reads completed successfully. - "read_merged", # Adjacent read requests merged in a single req. - "read_sectors", # Total number of sectors read successfully. - "msec_read", # Total number of ms spent by all reads. - "write_requests", # total number of writes completed successfully. - "write_merged", # Adjacent write requests merged in a single req. - "write_sectors", # total number of sectors written successfully. - "msec_write", # Total number of ms spent by all writes. - "ios_in_progress", # Number of actual I/O requests currently in flight. - "msec_total", # Amount of time during which ios_in_progress >= 1. + "read_requests", # Total number of reads completed successfully. + "read_merged", # Adjacent read requests merged in a single req. + "read_sectors", # Total number of sectors read successfully. + "msec_read", # Total number of ms spent by all reads. + "write_requests", # total number of writes completed successfully. + "write_merged", # Adjacent write requests merged in a single req. 
+ "write_sectors", # total number of sectors written successfully. + "msec_write", # Total number of ms spent by all writes. + "ios_in_progress", # Number of actual I/O requests currently in flight. + "msec_total", # Amount of time during which ios_in_progress >= 1. "msec_weighted_total", # Measure of recent I/O completion time and backlog. ) @@ -101,7 +101,9 @@ METRIC_MAPPING = yaml_conf.load_collector_configuration('node_metrics.yml') -prev_times = (0,0) +prev_times = (0, 0) + + def read_uptime(): global prev_times try: @@ -109,7 +111,7 @@ def read_uptime(): line = f_uptime.readline() curr_times = line.split(None) - delta_times = (float(curr_times[0]) - float(prev_times[0]), float(curr_times[1]) - float(prev_times[1])) + delta_times = (float(curr_times[0]) - float(prev_times[0]), float(curr_times[1]) - float(prev_times[1])) prev_times = curr_times return delta_times finally: @@ -184,14 +186,14 @@ def main(): # full stats line for i in range(11): print("%s%s %d %s dev=%s" - % (metric, FIELDS_DISK[i], ts, values[i+3], device)) + % (metric, FIELDS_DISK[i], ts, values[i + 3], device)) metric_naming.print_if_apptuit_standard_metric(metric + FIELDS_DISK[i], METRIC_MAPPING, ts, - values[i+3], tags={"dev": device}, tags_str=None) + values[i + 3], tags={"dev": device}, tags_str=None) ret = is_device(device, 0) # if a device or a partition, calculate the svctm/await/util if ret: - stats = dict(zip(FIELDS_DISK, values[3:])) + stats = dict(list(zip(FIELDS_DISK, values[3:]))) if not device in prev_stats: prev_stats[device] = init_stats rd_ios = float(stats.get("read_requests")) @@ -201,7 +203,8 @@ def main(): prev_wr_ios = float(prev_stats[device].get("write_requests")) prev_nr_ios = prev_rd_ios + prev_wr_ios tput = ((nr_ios - prev_nr_ios) * float(HZ) / float(itv)) - util = ((float(stats.get("msec_total")) - float(prev_stats[device].get("msec_total"))) * float(HZ) / float(itv)) + util = ((float(stats.get("msec_total")) - float(prev_stats[device].get("msec_total"))) * float( + HZ) / float(itv)) svctm = 0.0 await = 0.0 r_await = 0.0 @@ -215,23 +218,24 @@ def main(): prev_rd_ticks = prev_stats[device].get("msec_read") prev_wr_ticks = prev_stats[device].get("msec_write") if rd_ios != prev_rd_ios: - r_await = (float(rd_ticks) - float(prev_rd_ticks) ) / float(rd_ios - prev_rd_ios) + r_await = (float(rd_ticks) - float(prev_rd_ticks)) / float(rd_ios - prev_rd_ios) if wr_ios != prev_wr_ios: - w_await = (float(wr_ticks) - float(prev_wr_ticks) ) / float(wr_ios - prev_wr_ios) + w_await = (float(wr_ticks) - float(prev_wr_ticks)) / float(wr_ios - prev_wr_ios) if nr_ios != prev_nr_ios: - await = (float(rd_ticks) + float(wr_ticks) - float(prev_rd_ticks) - float(prev_wr_ticks)) / float(nr_ios - prev_nr_ios) + await = (float(rd_ticks) + float(wr_ticks) - float(prev_rd_ticks) - float( + prev_wr_ticks)) / float(nr_ios - prev_nr_ios) print("%s%s %d %.2f dev=%s" - % (metric, "svctm", ts, svctm, device)) + % (metric, "svctm", ts, svctm, device)) print("%s%s %d %.2f dev=%s" - % (metric, "r_await", ts, r_await, device)) + % (metric, "r_await", ts, r_await, device)) print("%s%s %d %.2f dev=%s" - % (metric, "w_await", ts, w_await, device)) + % (metric, "w_await", ts, w_await, device)) print("%s%s %d %.2f dev=%s" - % (metric, "await", ts, await, device)) + % (metric, "await", ts, await, device)) util_val = float(util / 1000.0) print("%s%s %d %.2f dev=%s" - % (metric, "util", ts, util_val, device)) - metric_naming.print_if_apptuit_standard_metric(metric+"util", METRIC_MAPPING, ts, + % (metric, "util", ts, util_val, device)) + 
metric_naming.print_if_apptuit_standard_metric(metric + "util", METRIC_MAPPING, ts, format(round(util_val, 2)), tags={"dev": device}, tags_str=None) @@ -241,9 +245,9 @@ def main(): # partial stats line for i in range(4): print("%s%s %d %s dev=%s" - % (metric, FIELDS_PART[i], ts, values[i+3], device)) + % (metric, FIELDS_PART[i], ts, values[i + 3], device)) else: - print >> sys.stderr, "Cannot parse /proc/diskstats line: ", line + utils.err("Cannot parse /proc/diskstats line: ", line) continue sys.stdout.flush() diff --git a/collectors/0/jolokia.py b/collectors/0/jolokia.py index 3eb536f8..36ec4dd8 100755 --- a/collectors/0/jolokia.py +++ b/collectors/0/jolokia.py @@ -23,6 +23,7 @@ import time import sys import copy + try: import simplejson as json except ImportError: @@ -98,25 +99,25 @@ def print_metrics(self, d, metric_prefix, timestamp, tags, not_tags=[]): """ Take a dict of attributes and print out numerical metric strings Recurse if necessary """ - for k, v in d.iteritems(): + for k, v in d.items(): # Tack on the name of the attribute attribute, more_tags = self.parse_attribute(k.lower(), not_tags) metric_name = '.'.join([metric_prefix, attribute]) my_tags = tags + more_tags # If numerical if utils.is_numeric(v): - print "%s %d %s %s" % (metric_name, timestamp, str(v), - ' '.join(my_tags)) + print("%s %d %s %s" % (metric_name, timestamp, str(v), + ' '.join(my_tags))) # If a bool, True=1, False=0 elif type(v) is bool: - print "%s %d %s %s" % (metric_name, timestamp, str(int(v)), - ' '.join(my_tags)) + print("%s %d %s %s" % (metric_name, timestamp, str(int(v)), + ' '.join(my_tags))) # Or a dict of more attributes, call ourselves again elif type(v) is dict: self.print_metrics(v, metric_name, timestamp, my_tags, not_tags) else: - #lists, strings, etc - #print '# ', type(v), metric_name, str(v) + # lists, strings, etc + # print '# ', type(v), metric_name, str(v) pass def process_data(self): @@ -136,7 +137,7 @@ def process_data(self): if monitor['mbean'] == mbean['request']['mbean']: if mbean['status'] == 200: self.print_metrics(mbean['value'], monitor['metric'], mbean['timestamp'], - monitor['taglist'], monitor['not_tags']) + monitor['taglist'], monitor['not_tags']) break else: utils.err("error: mbean not found - " + monitor['mbean']) @@ -212,6 +213,7 @@ def main(): break # End while True + if __name__ == "__main__": main() diff --git a/collectors/0/memcache.py b/collectors/0/memcache.py index 9a42512d..002570a3 100755 --- a/collectors/0/memcache.py +++ b/collectors/0/memcache.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -import platform import socket import subprocess import sys @@ -8,181 +7,186 @@ from collectors.etc import yaml_conf from collectors.etc import metric_naming +from collectors.lib import utils COLLECTION_INTERVAL = 15 # seconds # Those are the stats we MUST collect at every COLLECTION_INTERVAL. 
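The iteritems() replacements above (jolokia.py, the hadoop collectors, hbase_regionserver.py) all follow the same portable pattern: dict.iteritems() was removed in Python 3, while dict.items() exists on both versions (a list on Python 2, a view on Python 3), which is equivalent for read-only iteration. A tiny illustrative sketch, not part of this patch:

    replacements = {"rpcactivityforport": ["rpc_activity"]}   # hypothetical mapping
    for key, value in replacements.items():   # iteritems() would fail on Python 3
        print("%s -> %s" % (key, value))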
IMPORTANT_STATS = [ - "rusage_user", "rusage_system", - "curr_connections", "total_connections", "connection_structures", - "cmd_get", "cmd_set", - "get_hits", "get_misses", - "delete_misses", "delete_hits", - "bytes_read", "bytes_written", "bytes", - "curr_items", "total_items", "evictions", - ] + "rusage_user", "rusage_system", + "curr_connections", "total_connections", "connection_structures", + "cmd_get", "cmd_set", + "get_hits", "get_misses", + "delete_misses", "delete_hits", + "bytes_read", "bytes_written", "bytes", + "curr_items", "total_items", "evictions", +] IMPORTANT_STATS_SET = set(IMPORTANT_STATS) # Important things on a slab basis IMPORTANT_STATS_SLAB = [ - "cas_hits", "cas_badval", "incr_hits", "decr_hits", "delete_hits", - "cmd_set", "get_hits", "free_chunks", "used_chunks", "total_chunks" + "cas_hits", "cas_badval", "incr_hits", "decr_hits", "delete_hits", + "cmd_set", "get_hits", "free_chunks", "used_chunks", "total_chunks" ] IMPORTANT_STATS_SLAB_SET = set(IMPORTANT_STATS_SLAB) # Stats that really don't belong to the TSDB. IGNORED_STATS_SET = set(["time", "uptime", "version", "pid", "libevent"]) IGNORED_STATS_SLAB_SET = set(["chunk_size", "chunks_per_page", "total_pages", - "mem_requested", "free_chunks_end"]) + "mem_requested", "free_chunks_end"]) # TODO(tsuna): Don't hardcode those. DATASETS = { - 11211: "default", # XXX StumbleUpon specific mapping of port-to-dataset - } + 11211: "default", # XXX StumbleUpon specific mapping of port-to-dataset +} MEMCACHE_NAME_MAPPING = yaml_conf.load_collector_configuration('memcached_metrics.yml') def find_memcached(): - """Yields all the ports that memcached is listening to, according to ps.""" - p = subprocess.Popen(["ps", "-Ao", "args"], stdout=subprocess.PIPE) - stdout, stderr = p.communicate() - assert p.returncode in (0, 1), "ps returned %r" % p.returncode - for line in stdout.split("\n"): - if not line: - continue - - if line.find("memcached") < 0: - continue - - host = line.find(" -l ") - if host < 0: - host = "127.0.0.1" - else: - host = line[host + 4:].split(" ")[0] - # host = socket.inet_pton(socket.AF_INET,host) - - port = line.find(" -p ") - if port < 0: - print >> sys.stderr, "Weird memcached process without a -p argument:", line - continue - port = line[port + 4: line.index(" ", port + 5)] - port = int(port) - if port in DATASETS: - print >> sys.stderr, "Host and port: %s %d" % (host, port) - yield host, port - else: - print >> sys.stderr, "Unknown memached port:", port + """Yields all the ports that memcached is listening to, according to ps.""" + p = subprocess.Popen(["ps", "-Ao", "args"], stdout=subprocess.PIPE) + stdout, stderr = p.communicate() + assert p.returncode in (0, 1), "ps returned %r" % p.returncode + output = stdout.decode("utf-8") + for line in output.split("\n"): + if not line: + continue + + if line.find("memcached") < 0: + continue + + host = line.find(" -l ") + if host < 0: + host = "127.0.0.1" + else: + host = line[host + 4:].split(" ")[0] + # host = socket.inet_pton(socket.AF_INET,host) + + port = line.find(" -p ") + if port < 0: + utils.err("Weird memcached process without a -p argument:", line) + continue + port = line[port + 4: line.index(" ", port + 5)] + port = int(port) + if port in DATASETS: + utils.err("Host and port: %s %d" % (host, port)) + yield host, port + else: + utils.err("Unknown memached port:", port) + def collect_stats(sock): - """Sends the 'stats' command to the socket given in argument.""" - sock.send("stats\r\n") - - in_stats = "" - end_time = time.time() + 1 - - while 
time.time() < end_time: - in_stats += sock.recv(8192) - stats = [line.rstrip() for line in in_stats.split("\n")] - if stats[-1] == "" and stats[-2] == "END": - break - time.sleep(0.1) - - assert stats[-1] == "", repr(stats) - assert stats[-2] == "END", repr(stats) - # Each line is of the form: STAT statname value - stats = dict(line.split()[1:3] for line in stats[:-2]) - stats["time"] = int(stats["time"]) - return stats + """Sends the 'stats' command to the socket given in argument.""" + sock.send("stats\r\n") + + in_stats = "" + end_time = time.time() + 1 + + while time.time() < end_time: + in_stats += sock.recv(8192) + stats = [line.rstrip() for line in in_stats.split("\n")] + if stats[-1] == "" and stats[-2] == "END": + break + time.sleep(0.1) + + assert stats[-1] == "", repr(stats) + assert stats[-2] == "END", repr(stats) + # Each line is of the form: STAT statname value + stats = dict(line.split()[1:3] for line in stats[:-2]) + stats["time"] = int(stats["time"]) + return stats + def collect_stats_slabs(sock): - """Sends the 'stats slabs' command to the socket given in argument.""" - sock.send("stats slabs\r\n") - - in_stats = "" - end_time = time.time() + 1 - - # The output from 'stats slabs' is long enough that we never get it all in - # the first call to sock.recv. This is a dumb loop that allows us to wait - # a little bit for the data, but doesn't stall the collector forever. - while time.time() < end_time: - in_stats += sock.recv(65536) - stats = [line.rstrip() for line in in_stats.split("\n")] - if stats[-1] == "" and stats[-2] == "END": - break - time.sleep(0.1) - - assert stats[-1] == "", repr(stats) - assert stats[-2] == "END", repr(stats) - - # prep the stats for slabs. note the -4 because there are two lines - # at the bottom we don't want: active_slabs and total_malloced. - out_stats = {} - slabs = dict(line.split()[1:3] for line in stats[:-4]) - for stat in slabs: - slab_id, stat_name = stat.split(":") - if slab_id not in out_stats: - out_stats[slab_id] = {} - out_stats[slab_id][stat_name] = slabs[stat] - return out_stats + """Sends the 'stats slabs' command to the socket given in argument.""" + sock.send("stats slabs\r\n") + + in_stats = "" + end_time = time.time() + 1 + + # The output from 'stats slabs' is long enough that we never get it all in + # the first call to sock.recv. This is a dumb loop that allows us to wait + # a little bit for the data, but doesn't stall the collector forever. + while time.time() < end_time: + in_stats += sock.recv(65536) + stats = [line.rstrip() for line in in_stats.split("\n")] + if stats[-1] == "" and stats[-2] == "END": + break + time.sleep(0.1) + + assert stats[-1] == "", repr(stats) + assert stats[-2] == "END", repr(stats) + + # prep the stats for slabs. note the -4 because there are two lines + # at the bottom we don't want: active_slabs and total_malloced. + out_stats = {} + slabs = dict(line.split()[1:3] for line in stats[:-4]) + for stat in slabs: + slab_id, stat_name = stat.split(":") + if slab_id not in out_stats: + out_stats[slab_id] = {} + out_stats[slab_id][stat_name] = slabs[stat] + return out_stats + def main(args): - """Collects and dumps stats from a memcache server.""" - sockets = {} # Maps a dataset name to a socket connected its memcached. - for host,port in find_memcached(): - dataset = DATASETS[port] - sockets[dataset] = socket.socket() - sockets[dataset].connect((host, port)) - if not sockets: - return 13 # No memcached server running. - - stats = {} # Maps a dataset name to a dict that maps a stats to a value. 
- slabs = {} # Same, but for slabs. - - def print_stat(stat, dataset): - print ("memcache.%s %d %s dataset=%s" - % (stat, stats[dataset]["time"], stats[dataset][stat], dataset)) - - def print_stat_slab(stat, slab_id, dataset): - # note we purloin 'time' from the stats call above ... - print ("memcache.slab.%s %d %s chunksize=%s dataset=%s" - % (stat, stats[dataset]["time"], slabs[dataset][slab_id][stat], - slabs[dataset][slab_id]["chunk_size"], dataset)) - - while True: - for dataset, sock in sockets.iteritems(): - stats[dataset] = collect_stats(sock) - - # Print all the important stats first. - for stat in IMPORTANT_STATS: - print_stat(stat, dataset) - - for stat in stats[dataset]: - metric_naming.print_if_apptuit_standard_metric("memcache."+stat, MEMCACHE_NAME_MAPPING, - stats[dataset]["time"], stats[dataset][stat], - tags={"dataset" : dataset}, tags_str=None) - - for stat in stats[dataset]: - if (stat not in IMPORTANT_STATS_SET # Don't re-print them. - and stat not in IGNORED_STATS_SET): # Don't record those. - print_stat(stat, dataset) - - # now do above, but for slabs - slabs[dataset] = collect_stats_slabs(sock) - - for slab_id in slabs[dataset]: - for stat in IMPORTANT_STATS_SLAB: - print_stat_slab(stat, slab_id, dataset) - - for stat in slabs[dataset][slab_id]: - if (stat not in IMPORTANT_STATS_SLAB_SET # Don't re-print them. - and stat not in IGNORED_STATS_SLAB_SET): # Don't record those. - print_stat_slab(stat, slab_id, dataset) - - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) + """Collects and dumps stats from a memcache server.""" + sockets = {} # Maps a dataset name to a socket connected its memcached. + for host, port in find_memcached(): + dataset = DATASETS[port] + sockets[dataset] = socket.socket() + sockets[dataset].connect((host, port)) + if not sockets: + return 13 # No memcached server running. + + stats = {} # Maps a dataset name to a dict that maps a stats to a value. + slabs = {} # Same, but for slabs. + + def print_stat(stat, dataset): + print("memcache.%s %d %s dataset=%s" + % (stat, stats[dataset]["time"], stats[dataset][stat], dataset)) + + def print_stat_slab(stat, slab_id, dataset): + # note we purloin 'time' from the stats call above ... + print("memcache.slab.%s %d %s chunksize=%s dataset=%s" + % (stat, stats[dataset]["time"], slabs[dataset][slab_id][stat], + slabs[dataset][slab_id]["chunk_size"], dataset)) + + while True: + for dataset, sock in sockets.items(): + stats[dataset] = collect_stats(sock) + + # Print all the important stats first. + for stat in IMPORTANT_STATS: + print_stat(stat, dataset) + + for stat in stats[dataset]: + metric_naming.print_if_apptuit_standard_metric("memcache." + stat, MEMCACHE_NAME_MAPPING, + stats[dataset]["time"], stats[dataset][stat], + tags={"dataset": dataset}, tags_str=None) + + for stat in stats[dataset]: + if (stat not in IMPORTANT_STATS_SET # Don't re-print them. + and stat not in IGNORED_STATS_SET): # Don't record those. + print_stat(stat, dataset) + + # now do above, but for slabs + slabs[dataset] = collect_stats_slabs(sock) + + for slab_id in slabs[dataset]: + for stat in IMPORTANT_STATS_SLAB: + print_stat_slab(stat, slab_id, dataset) + + for stat in slabs[dataset][slab_id]: + if (stat not in IMPORTANT_STATS_SLAB_SET # Don't re-print them. + and stat not in IGNORED_STATS_SLAB_SET): # Don't record those. 
+ print_stat_slab(stat, slab_id, dataset) + + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main(sys.argv)) + sys.stdin.close() + sys.exit(main(sys.argv)) diff --git a/collectors/0/mongo.py b/collectors/0/mongo.py index b4e652eb..9979f8b9 100755 --- a/collectors/0/mongo.py +++ b/collectors/0/mongo.py @@ -15,6 +15,7 @@ import sys import time + try: import pymongo except ImportError: @@ -56,15 +57,16 @@ 'network.numRequests', ) TAG_METRICS = ( - ('asserts', ('msg', 'regular', 'user', 'warning')), - ('opcounters', ('command', 'delete', 'getmore', 'insert', 'query', 'update')), + ('asserts', ('msg', 'regular', 'user', 'warning')), + ('opcounters', ('command', 'delete', 'getmore', 'insert', 'query', 'update')), ) + def main(): utils.drop_privileges() if pymongo is None: - print >>sys.stderr, "error: Python module `pymongo' is missing" - return 13 + utils.err("error: Python module `pymongo' is missing") + return 13 c = pymongo.Connection(host=HOST, port=PORT) @@ -74,8 +76,8 @@ def main(): for base_metric, tags in TAG_METRICS: for tag in tags: - print 'mongo.%s %d %s type=%s' % (base_metric, ts, - res[base_metric][tag], tag) + print('mongo.%s %d %s type=%s' % (base_metric, ts, + res[base_metric][tag], tag)) for metric in METRICS: cur = res try: @@ -83,10 +85,11 @@ def main(): cur = cur[m] except KeyError: continue - print 'mongo.%s %d %s' % (metric, ts, cur) + print('mongo.%s %d %s' % (metric, ts, cur)) sys.stdout.flush() time.sleep(INTERVAL) + if __name__ == '__main__': sys.exit(main()) diff --git a/collectors/0/mongo3.py b/collectors/0/mongo3.py index 41f490d4..40ba6931 100755 --- a/collectors/0/mongo3.py +++ b/collectors/0/mongo3.py @@ -16,7 +16,7 @@ import sys import time -import os + try: import pymongo except ImportError: @@ -180,6 +180,7 @@ 'uptime' ) + def runServerStatus(c): res = c.admin.command('serverStatus') ts = int(time.time()) @@ -191,7 +192,7 @@ def runServerStatus(c): cur = cur[m] except KeyError: continue - print 'mongo.%s %d %s' % (metric, ts, cur) + print('mongo.%s %d %s' % (metric, ts, cur)) for metric in CONFIG_LOCKS_METRICS: cur = res @@ -201,7 +202,8 @@ def runServerStatus(c): except KeyError: continue for k, v in cur.items(): - print 'mongo.%s %d %s mode=%s' % (metric, ts, v, k) + print('mongo.%s %d %s mode=%s' % (metric, ts, v, k)) + def runDbStats(c): for db_name in DB_NAMES: @@ -215,7 +217,7 @@ def runDbStats(c): cur = cur[m] except KeyError: continue - print 'mongo.db.%s %d %s db=%s' % (metric, ts, cur, db_name) + print('mongo.db.%s %d %s db=%s' % (metric, ts, cur, db_name)) raw_metrics = res['raw'] for key, value in raw_metrics.items(): @@ -229,7 +231,8 @@ def runDbStats(c): cur = cur[m] except KeyError: continue - print 'mongo.rs.%s %d %s replica=%s db=%s' % (metric, ts, cur, replica_name, db_name) + print('mongo.rs.%s %d %s replica=%s db=%s' % (metric, ts, cur, replica_name, db_name)) + def runReplSetGetStatus(c): res = c.admin.command('replSetGetStatus') @@ -254,7 +257,7 @@ def runReplSetGetStatus(c): cur = cur[m] except KeyError: continue - print 'mongo.replica.%s %d %s replica_set=%s replica=%s replica_state=%s replica_health=%s' % (metric, ts, cur, replica_set_name, replica_name, replica_state, replica_health) + print('mongo.replica.%s %d %s replica_set=%s replica=%s replica_state=%s replica_health=%s' % (metric, ts, cur, replica_set_name, replica_name, replica_state, replica_health)) def loadEnv(): global USER, PASS, INTERVAL, DB_NAMES, CONFIG_CONN, MONGOS_CONN, REPLICA_CONN @@ -279,12 +282,13 
@@ def loadEnv(): PASS = mongodb3_conf.get_settings()['password'] INTERVAL = mongodb3_conf.get_settings()['interval'] + def main(): loadEnv() utils.drop_privileges() if pymongo is None: - print >>sys.stderr, "error: Python module `pymongo' is missing" + utils.err("error: Python module `pymongo' is missing") return 13 for index, item in enumerate(CONFIG_CONN, start=0): @@ -318,5 +322,6 @@ def main(): sys.stdout.flush() time.sleep(INTERVAL) + if __name__ == '__main__': sys.exit(main()) diff --git a/collectors/0/mountstats.py b/collectors/0/mountstats.py index 93f6fbdb..5a5bfa7f 100755 --- a/collectors/0/mountstats.py +++ b/collectors/0/mountstats.py @@ -88,14 +88,17 @@ COLLECTION_INTERVAL = 10 # seconds # BYTES_FIELDS is individual fields in the 'bytes: ' line -BYTES_FIELDS = ['normalread', 'normalwrite', 'directread', 'directwrite', 'serverread', 'serverwrite', 'readpages', 'writepages'] +BYTES_FIELDS = ['normalread', 'normalwrite', 'directread', 'directwrite', 'serverread', 'serverwrite', 'readpages', + 'writepages'] # KEY_METRICS contains the RPC call metrics we want specific data for KEY_METRICS = ['GETATTR', 'ACCESS', 'READ', 'WRITE'] # OTHER_METRICS contains the other RPC call we will aggregate as 'OTHER' -OTHER_METRICS = ['SETATTR', 'LOOKUP', 'READLINK', 'CREATE', 'MKDIR', 'SYMLINK', 'MKNOD', 'REMOVE', 'RMDIR', 'RENAME', 'LINK', 'READDIR', 'READDIRPLUS', 'FSSTAT', 'FSINFO', 'PATHCONF', 'COMMIT'] +OTHER_METRICS = ['SETATTR', 'LOOKUP', 'READLINK', 'CREATE', 'MKDIR', 'SYMLINK', 'MKNOD', 'REMOVE', 'RMDIR', 'RENAME', + 'LINK', 'READDIR', 'READDIRPLUS', 'FSSTAT', 'FSINFO', 'PATHCONF', 'COMMIT'] # RPC_FIELDS is the individual metric fields on the RPC metric lines RPC_FIELDS = ['ops', 'txs', 'timeouts', 'txbytes', 'rxbytes', 'qtime', 'rttime', 'totaltime'] + def main(): """nfsstats main loop.""" try: @@ -107,7 +110,7 @@ def main(): device = None f_nfsstats.seek(0) ts = int(time.time()) - rpc_metrics = { } + rpc_metrics = {} for line in f_nfsstats: values = line.split(None) if len(values) == 0: @@ -120,13 +123,13 @@ def main(): mountpoint = values[4] mount = mount.rstrip("/") device = hostname + mount + mountpoint - rpc_metrics[device] = { } - rpc_metrics[device]['other'] = dict((x,0) for x in RPC_FIELDS) + rpc_metrics[device] = {} + rpc_metrics[device]['other'] = dict((x, 0) for x in RPC_FIELDS) rpc_metrics[device]['nfshost'] = hostname rpc_metrics[device]['nfsvol'] = mount - rpc_metrics[device]['mounts'] = [ mount ] + rpc_metrics[device]['mounts'] = [mount] for metric in KEY_METRICS: - rpc_metrics[device][metric] = dict((x,0) for x in RPC_FIELDS) + rpc_metrics[device][metric] = dict((x, 0) for x in RPC_FIELDS) if device == None: continue @@ -145,24 +148,24 @@ def main(): if m in rpc_metrics: # metrics already counted, mark as dupe ignore dupe = True - first_device=rpc_metrics[m] + first_device = rpc_metrics[m] rpc_metrics[first_device]['mounts'].append(mount) rpc_metrics[device]['dupe'] = True else: rpc_metrics[m] = device if field == "bytes": - rpc_metrics[device]['bytes'] = dict((BYTES_FIELDS[i], values[i+1]) for i in range(0, len(BYTES_FIELDS))) + rpc_metrics[device]['bytes'] = dict( + (BYTES_FIELDS[i], values[i + 1]) for i in range(0, len(BYTES_FIELDS))) if field in KEY_METRICS: for i in range(1, len(RPC_FIELDS) + 1): metric = field - rpc_metrics[device][metric][RPC_FIELDS[i-1]] += int(values[i]) + rpc_metrics[device][metric][RPC_FIELDS[i - 1]] += int(values[i]) if field in OTHER_METRICS: for i in range(1, len(RPC_FIELDS) + 1): - rpc_metrics[device]['other'][RPC_FIELDS[i-1]] += 
int(values[i]) - + rpc_metrics[device]['other'][RPC_FIELDS[i - 1]] += int(values[i]) for device in rpc_metrics: # Skip the duplicates @@ -174,16 +177,15 @@ def main(): nfshost = rpc_metrics[device]['nfshost'] rpc_metrics[device]['mounts'].sort() nfsvol = rpc_metrics[device]['mounts'][0] - for metric in KEY_METRICS+['other']: + for metric in KEY_METRICS + ['other']: for field in rpc_metrics[device][metric]: - print "proc.mountstats.%s.%s %d %s nfshost=%s nfsvol=%s" % (metric.lower(), field.lower(), ts, rpc_metrics[device][metric][field], nfshost, nfsvol) + print("proc.mountstats.%s.%s %d %s nfshost=%s nfsvol=%s" % (metric.lower(), field.lower(), ts, rpc_metrics[device][metric][field], nfshost, nfsvol)) for field in BYTES_FIELDS: - print "proc.mountstats.bytes.%s %d %s nfshost=%s nfsvol=%s" % (field.lower(), ts, rpc_metrics[device]['bytes'][field], nfshost, nfsvol) + print("proc.mountstats.bytes.%s %d %s nfshost=%s nfsvol=%s" % (field.lower(), ts, rpc_metrics[device]['bytes'][field], nfshost, nfsvol)) sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) - if __name__ == "__main__": main() diff --git a/collectors/0/mysql.py b/collectors/0/mysql.py index ec02d47a..5ef5f755 100755 --- a/collectors/0/mysql.py +++ b/collectors/0/mysql.py @@ -28,9 +28,9 @@ import time try: - import MySQLdb + import MySQLdb except ImportError: - MySQLdb = None # This is handled gracefully in main() + MySQLdb = None # This is handled gracefully in main() from collectors.etc import mysqlconf from collectors.lib import utils @@ -43,377 +43,385 @@ DB_REFRESH_INTERVAL = 60 # seconds # Usual locations where to find the default socket file. DEFAULT_SOCKFILES = set([ - "/tmp/mysql.sock", # MySQL's own default. - "/var/lib/mysql/mysql.sock", # RH-type / RPM systems. - "/var/run/mysqld/mysqld.sock", # Debian-type systems. + "/tmp/mysql.sock", # MySQL's own default. + "/var/lib/mysql/mysql.sock", # RH-type / RPM systems. + "/var/run/mysqld/mysqld.sock", # Debian-type systems. ]) # Directories under which to search additional socket files. SEARCH_DIRS = [ - "/var/lib/mysql", + "/var/lib/mysql", ] METRIC_MAPPING = yaml_conf.load_collector_configuration('mysql_metrics.yml') + class DB(object): - """Represents a MySQL server (as we can monitor more than 1 MySQL).""" - - def __init__(self, db_connection_props, db_config_host, db_custom_tags, dbname, db, cursor, version): - self.dbname = dbname - self.db = db - self.cursor = cursor - self.version = version - self.master = None - self.slave_bytes_executed = None - self.relay_bytes_relayed = None - - version = version.split(".") - try: - self.major = int(version[0]) - self.medium = int(version[1]) - except (ValueError, IndexError), e: - self.major = self.medium = 0 - - self.remotehostconnect = True - self.db_connection_props = db_connection_props - self.db_config_host = db_config_host - self.db_custom_tags = db_custom_tags - - def __str__(self): - return "DB(%r, %r, version=%r)" % (self.db_config_host, self.dbname, - self.version) - - def __repr__(self): - return self.__str__() - - def isShowGlobalStatusSafe(self): - """Returns whether or not SHOW GLOBAL STATUS is safe to run.""" - # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it - # locks the entire database for too long and severely impacts traffic. - return self.major > 5 or (self.major == 5 and self.medium >= 1) - - def query(self, sql): - """Executes the given SQL statement and returns a sequence of rows.""" - assert self.cursor, "%s already closed?" 
% (self,) - try: - self.cursor.execute(sql) - except MySQLdb.OperationalError, (errcode, msg): - if errcode != 2006: # "MySQL server has gone away" - raise - self._reconnect() - return self.cursor.fetchall() - - def close(self): - """Closes the connection to this MySQL server.""" - if self.cursor: - self.cursor.close() - self.cursor = None - if self.db: - self.db.close() - self.db = None - - def _reconnect(self): - """Reconnects to this MySQL server.""" - self.close() - self.db = mysql_connect(self.db_connection_props) - self.cursor = self.db.cursor() + """Represents a MySQL server (as we can monitor more than 1 MySQL).""" + + def __init__(self, db_connection_props, db_config_host, db_custom_tags, dbname, db, cursor, version): + self.dbname = dbname + self.db = db + self.cursor = cursor + self.version = version + self.master = None + self.slave_bytes_executed = None + self.relay_bytes_relayed = None + + version = version.split(".") + try: + self.major = int(version[0]) + self.medium = int(version[1]) + except (ValueError, IndexError) as e: + self.major = self.medium = 0 + + self.remotehostconnect = True + self.db_connection_props = db_connection_props + self.db_config_host = db_config_host + self.db_custom_tags = db_custom_tags + + def __str__(self): + return "DB(%r, %r, version=%r)" % (self.db_config_host, self.dbname, + self.version) + + def __repr__(self): + return self.__str__() + + def isShowGlobalStatusSafe(self): + """Returns whether or not SHOW GLOBAL STATUS is safe to run.""" + # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it + # locks the entire database for too long and severely impacts traffic. + return self.major > 5 or (self.major == 5 and self.medium >= 1) + + def query(self, sql): + """Executes the given SQL statement and returns a sequence of rows.""" + assert self.cursor, "%s already closed?" % (self,) + try: + self.cursor.execute(sql) + except MySQLdb.OperationalError as mysql_op_err: + (errcode, msg) = mysql_op_err.args + if errcode != 2006: # "MySQL server has gone away" + raise + self._reconnect() + return self.cursor.fetchall() + + def close(self): + """Closes the connection to this MySQL server.""" + if self.cursor: + self.cursor.close() + self.cursor = None + if self.db: + self.db.close() + self.db = None + + def _reconnect(self): + """Reconnects to this MySQL server.""" + self.close() + self.db = mysql_connect(self.db_connection_props) + self.cursor = self.db.cursor() def mysql_connect(conn_props): - """Connects to the MySQL server using the specified connection properties.""" - return MySQLdb.connect(host=conn_props[0], - port=conn_props[1], - user=conn_props[2], passwd=conn_props[3]) + """Connects to the MySQL server using the specified connection properties.""" + return MySQLdb.connect(host=conn_props[0], + port=conn_props[1], + user=conn_props[2], passwd=conn_props[3]) def todict(db, row): - """Transforms a row (returned by DB.query) into a dict keyed by column names. - - Args: - db: The DB instance from which this row was obtained. - row: A row as returned by DB.query - """ - d = {} - for i, field in enumerate(db.cursor.description): - column = field[0].lower() # Lower-case to normalize field names. - d[column] = row[i] - return d + """Transforms a row (returned by DB.query) into a dict keyed by column names. + + Args: + db: The DB instance from which this row was obtained. 
+ row: A row as returned by DB.query + """ + d = {} + for i, field in enumerate(db.cursor.description): + column = field[0].lower() # Lower-case to normalize field names. + d[column] = row[i] + return d + def get_dbname(sockfile): - """Returns the name of the DB based on the path to the socket file.""" - if sockfile in DEFAULT_SOCKFILES: - return "default" - m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile) - if not m: - utils.err("error: couldn't guess the name of the DB for " + sockfile) - return None - return m.group(1) + """Returns the name of the DB based on the path to the socket file.""" + if sockfile in DEFAULT_SOCKFILES: + return "default" + m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile) + if not m: + utils.err("error: couldn't guess the name of the DB for " + sockfile) + return None + return m.group(1) def find_sockfiles(): - """Returns a list of paths to socket files to monitor.""" - paths = [] - # Look for socket files. - for dir in SEARCH_DIRS: - if not os.path.isdir(dir) or not os.access(dir, os.R_OK): - continue - for name in os.listdir(dir): - subdir = os.path.join(dir, name) - if not os.path.isdir(subdir) or not os.access(subdir, os.R_OK): - continue - for subname in os.listdir(subdir): - path = os.path.join(subdir, subname) - if utils.is_sockfile(path): - paths.append(path) - break # We only expect 1 socket file per DB, so get out. - # Try the default locations. - for sockfile in DEFAULT_SOCKFILES: - if not utils.is_sockfile(sockfile): - continue - paths.append(sockfile) - return paths + """Returns a list of paths to socket files to monitor.""" + paths = [] + # Look for socket files. + for dir in SEARCH_DIRS: + if not os.path.isdir(dir) or not os.access(dir, os.R_OK): + continue + for name in os.listdir(dir): + subdir = os.path.join(dir, name) + if not os.path.isdir(subdir) or not os.access(subdir, os.R_OK): + continue + for subname in os.listdir(subdir): + path = os.path.join(subdir, subname) + if utils.is_sockfile(path): + paths.append(path) + break # We only expect 1 socket file per DB, so get out. + # Try the default locations. + for sockfile in DEFAULT_SOCKFILES: + if not utils.is_sockfile(sockfile): + continue + paths.append(sockfile) + return paths + def die(): - exit(13) + exit(13) + def find_databases(dbs=None): - """Returns a map of dbname (string) to DB instances to monitor. - - Args: - dbs: A map of dbname (string) to DB instances already monitored. - This map will be modified in place if it's not None. - """ - if dbs is None: - dbs = {} - for db_config_host in mysqlconf.get_db_hosts(): - if db_config_host in dbs: - continue - conn_props = mysqlconf.get_db_connection_properties(db_config_host) - db_name = "default" - try: - db = mysql_connect(conn_props) - cursor = db.cursor() - cursor.execute("SELECT VERSION()") - connected = True - except (EnvironmentError, EOFError, RuntimeError, socket.error, - MySQLdb.MySQLError), e: - utils.err("Couldn't connect to %s: %s" % (conn_props[2] + "@" + db_config_host + ":" + str(conn_props[1]), e)) - continue - - if connected: - version = cursor.fetchone()[0] - dbs[db_config_host] = DB(db_connection_props=conn_props, db_config_host=db_config_host, - db_custom_tags=mysqlconf.get_db_custom_tags(db_config_host), - dbname=db_name, db=db, cursor=cursor, version=version) - - if not dbs: - die() - - return dbs + """Returns a map of dbname (string) to DB instances to monitor. + + Args: + dbs: A map of dbname (string) to DB instances already monitored. + This map will be modified in place if it's not None. 
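The DB.query() change above shows the general recipe for the old "except Exc, (a, b):" form: Python 3 only accepts "except Exc as e:", and the tuple has to be pulled out of e.args afterwards (e.args carries the same values on Python 2 as well). A tiny illustrative sketch, not part of this patch, using OSError as a stand-in for MySQLdb.OperationalError and a made-up failure:

    try:
        raise OSError(2006, "MySQL server has gone away")   # hypothetical failure
    except OSError as exc:
        errcode, msg = exc.args    # replaces "except OSError, (errcode, msg):"
        if errcode != 2006:        # 2006 means the server went away; reconnect instead
            raise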
+ """ + if dbs is None: + dbs = {} + for db_config_host in mysqlconf.get_db_hosts(): + if db_config_host in dbs: + continue + conn_props = mysqlconf.get_db_connection_properties(db_config_host) + db_name = "default" + try: + db = mysql_connect(conn_props) + cursor = db.cursor() + cursor.execute("SELECT VERSION()") + connected = True + except (EnvironmentError, EOFError, RuntimeError, socket.error, + MySQLdb.MySQLError) as e: + utils.err("Couldn't connect to %s: %s" % (conn_props[2] + "@" + db_config_host + ":" + str(conn_props[1]), e)) + continue + + if connected: + version = cursor.fetchone()[0] + dbs[db_config_host] = DB(db_connection_props=conn_props, db_config_host=db_config_host, + db_custom_tags=mysqlconf.get_db_custom_tags(db_config_host), + dbname=db_name, db=db, cursor=cursor, version=version) + + if not dbs: + die() + + return dbs def now(): - return int(time.time()) + return int(time.time()) def isyes(s): - if s.lower() == "yes": - return 1 - return 0 + if s.lower() == "yes": + return 1 + return 0 def collectInnodbStatus(db): - """Collects and prints InnoDB stats about the given DB instance.""" - ts = now() - def printmetric(metric, value, tags=""): - print "mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags) - metric_naming.print_if_apptuit_standard_metric("mysql." + metric, METRIC_MAPPING, ts, value, - tags=mysqlconf.get_db_custom_tags(db.db_config_host), - tags_str="schema=" + db.dbname + tags) - - innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2] - m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$", - innodb_status, re.M) - if m: # If we have it, try to use InnoDB's own timestamp. - ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S"))) - - line = None - def match(regexp): - return re.match(regexp, line) - - for line in innodb_status.split("\n"): - # SEMAPHORES - m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)") - if m: - printmetric("innodb.oswait_array.reservation_count", m.group(1)) - printmetric("innodb.oswait_array.signal_count", m.group(2)) - continue - m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)") - if m: - printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex") - printmetric("innodb.locks.rounds", m.group(2), " type=mutex") - printmetric("innodb.locks.os_waits", m.group(3), " type=mutex") - continue - m = match("RW-shared spins (\d+), OS waits (\d+);" - " RW-excl spins (\d+), OS waits (\d+)") - if m: - printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared") - printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared") - printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive") - printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive") - continue - # GG 20141015 - RW-shared and RW-excl got separate lines and rounds in 5.5+ - m = match("RW-shared spins (\d+), rounds (\d+), OS waits (\d+)") - if m: - printmetric("locks.spin_waits", m.group(1), " type=rw-shared") - printmetric("locks.rounds", m.group(2), " type=rw-shared") - printmetric("locks.os_waits", m.group(3), " type=rw-shared") - continue - m = match("RW-excl spins (\d+), rounds (\d+), OS waits (\d+)") - if m: - printmetric("locks.spin_waits", m.group(1), " type=rw-exclusive") - printmetric("locks.rounds", m.group(2), " type=rw-exclusive") - printmetric("locks.os_waits", m.group(3), " type=rw-exclusive") - continue - # INSERT BUFFER AND ADAPTIVE HASH INDEX - # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and - # the following one can 
appear multiple times. I've never seen this. - # If that happens, we need to aggregate the values here instead of - # printing them directly. - m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),") - if m: - printmetric("innodb.ibuf.size", m.group(1)) - printmetric("innodb.ibuf.free_list_len", m.group(2)) - printmetric("innodb.ibuf.seg_size", m.group(3)) - continue - m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges") - if m: - printmetric("innodb.ibuf.inserts", m.group(1)) - printmetric("innodb.ibuf.merged_recs", m.group(2)) - printmetric("innodb.ibuf.merges", m.group(3)) - continue - # ROW OPERATIONS - m = match("\d+ queries inside InnoDB, (\d+) queries in queue") - if m: - printmetric("innodb.queries_queued", m.group(1)) - continue - m = match("(\d+) read views open inside InnoDB") - if m: - printmetric("innodb.opened_read_views", m.group(1)) - continue - # TRANSACTION - m = match("History list length (\d+)") - if m: - printmetric("innodb.history_list_length", m.group(1)) - continue + """Collects and prints InnoDB stats about the given DB instance.""" + ts = now() + + def printmetric(metric, value, tags=""): + print("mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)) + metric_naming.print_if_apptuit_standard_metric("mysql." + metric, METRIC_MAPPING, ts, value, + tags=mysqlconf.get_db_custom_tags(db.db_config_host), + tags_str="schema=" + db.dbname + tags) + + innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2] + m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$", + innodb_status, re.M) + if m: # If we have it, try to use InnoDB's own timestamp. + ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S"))) + + line = None + + def match(regexp): + return re.match(regexp, line) + + for line in innodb_status.split("\n"): + # SEMAPHORES + m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)") + if m: + printmetric("innodb.oswait_array.reservation_count", m.group(1)) + printmetric("innodb.oswait_array.signal_count", m.group(2)) + continue + m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)") + if m: + printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex") + printmetric("innodb.locks.rounds", m.group(2), " type=mutex") + printmetric("innodb.locks.os_waits", m.group(3), " type=mutex") + continue + m = match("RW-shared spins (\d+), OS waits (\d+);" + " RW-excl spins (\d+), OS waits (\d+)") + if m: + printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared") + printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared") + printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive") + printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive") + continue + # GG 20141015 - RW-shared and RW-excl got separate lines and rounds in 5.5+ + m = match("RW-shared spins (\d+), rounds (\d+), OS waits (\d+)") + if m: + printmetric("locks.spin_waits", m.group(1), " type=rw-shared") + printmetric("locks.rounds", m.group(2), " type=rw-shared") + printmetric("locks.os_waits", m.group(3), " type=rw-shared") + continue + m = match("RW-excl spins (\d+), rounds (\d+), OS waits (\d+)") + if m: + printmetric("locks.spin_waits", m.group(1), " type=rw-exclusive") + printmetric("locks.rounds", m.group(2), " type=rw-exclusive") + printmetric("locks.os_waits", m.group(3), " type=rw-exclusive") + continue + # INSERT BUFFER AND ADAPTIVE HASH INDEX + # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and + # the following one can appear multiple times. 
I've never seen this. + # If that happens, we need to aggregate the values here instead of + # printing them directly. + m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),") + if m: + printmetric("innodb.ibuf.size", m.group(1)) + printmetric("innodb.ibuf.free_list_len", m.group(2)) + printmetric("innodb.ibuf.seg_size", m.group(3)) + continue + m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges") + if m: + printmetric("innodb.ibuf.inserts", m.group(1)) + printmetric("innodb.ibuf.merged_recs", m.group(2)) + printmetric("innodb.ibuf.merges", m.group(3)) + continue + # ROW OPERATIONS + m = match("\d+ queries inside InnoDB, (\d+) queries in queue") + if m: + printmetric("innodb.queries_queued", m.group(1)) + continue + m = match("(\d+) read views open inside InnoDB") + if m: + printmetric("innodb.opened_read_views", m.group(1)) + continue + # TRANSACTION + m = match("History list length (\d+)") + if m: + printmetric("innodb.history_list_length", m.group(1)) + continue def collect(db): - """Collects and prints stats about the given DB instance.""" - - ts = now() - def printmetric(metric, value, tags=""): - print "mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags) - metric_naming.print_if_apptuit_standard_metric("mysql."+metric, METRIC_MAPPING, ts, value, - tags=mysqlconf.get_db_custom_tags(db.db_config_host), - tags_str="schema=" + db.dbname + tags) - - has_innodb = False - if db.isShowGlobalStatusSafe(): - for metric, value in db.query("SHOW GLOBAL STATUS"): - try: - if "." in value: - value = float(value) - else: - value = int(value) - except ValueError: - continue - metric = metric.lower() - has_innodb = has_innodb or metric.startswith("innodb") - printmetric(metric, value) - - if has_innodb: - collectInnodbStatus(db) - - if has_innodb and False: # Disabled because it's too expensive for InnoDB. - waits = {} # maps a mutex name to the number of waits + """Collects and prints stats about the given DB instance.""" + ts = now() - for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"): - if not status.startswith("os_waits"): - continue - m = re.search("&(\w+)(?:->(\w+))?$", mutex) - if not m: - continue - mutex, kind = m.groups() - if kind: - mutex += "." 
+ kind - wait_count = int(status.split("=", 1)[1]) - waits[mutex] = waits.get(mutex, 0) + wait_count - for mutex, wait_count in waits.iteritems(): - printmetric("innodb.locks", wait_count, " mutex=" + mutex) - - ts = now() - - mysql_slave_status = db.query("SHOW SLAVE STATUS") - if mysql_slave_status: - slave_status = todict(db, mysql_slave_status[0]) - master_host = slave_status["master_host"] - else: - master_host = None - - if master_host and master_host != "None": - sbm = slave_status.get("seconds_behind_master") - if isinstance(sbm, (int, long)): - printmetric("slave.seconds_behind_master", sbm) - printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"]) - printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"]) - printmetric("slave.thread_io_running", - isyes(slave_status["slave_io_running"])) - printmetric("slave.thread_sql_running", - isyes(slave_status["slave_sql_running"])) - - states = {} # maps a connection state to number of connections in that state - for row in db.query("SHOW PROCESSLIST"): - id, user, host, db_, cmd, time, state = row[:7] - states[cmd] = states.get(cmd, 0) + 1 - for state, count in states.iteritems(): - state = state.lower().replace(" ", "_") - printmetric("connection_states", count, " state=%s" % state) + def printmetric(metric, value, tags=""): + print("mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)) + metric_naming.print_if_apptuit_standard_metric("mysql." + metric, METRIC_MAPPING, ts, value, + tags=mysqlconf.get_db_custom_tags(db.db_config_host), + tags_str="schema=" + db.dbname + tags) + + has_innodb = False + if db.isShowGlobalStatusSafe(): + for metric, value in db.query("SHOW GLOBAL STATUS"): + try: + if "." in value: + value = float(value) + else: + value = int(value) + except ValueError: + continue + metric = metric.lower() + has_innodb = has_innodb or metric.startswith("innodb") + printmetric(metric, value) + + if has_innodb: + collectInnodbStatus(db) + + if has_innodb and False: # Disabled because it's too expensive for InnoDB. + waits = {} # maps a mutex name to the number of waits + ts = now() + for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"): + if not status.startswith("os_waits"): + continue + m = re.search("&(\w+)(?:->(\w+))?$", mutex) + if not m: + continue + mutex, kind = m.groups() + if kind: + mutex += "." + kind + wait_count = int(status.split("=", 1)[1]) + waits[mutex] = waits.get(mutex, 0) + wait_count + for mutex, wait_count in waits.items(): + printmetric("innodb.locks", wait_count, " mutex=" + mutex) -def main(args): - """Collects and dumps stats from a MySQL server.""" - if MySQLdb is None: - utils.err("error: Python module `MySQLdb' is missing") - return 1 - - last_db_refresh = now() - dbs = find_databases() - while True: ts = now() - if ts - last_db_refresh >= DB_REFRESH_INTERVAL: - find_databases(dbs) - last_db_refresh = ts - - errs = [] - for dbname, db in dbs.iteritems(): - try: - collect(db) - except (EnvironmentError, EOFError, RuntimeError, socket.error, - MySQLdb.MySQLError), e: - if isinstance(e, IOError) and e[0] == errno.EPIPE: - # Exit on a broken pipe. There's no point in continuing - # because no one will read our stdout anyway. 
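A minimal sketch of the broken-pipe guard described in the comment above (not the collector's own code): on Python 3, IOError is an alias of OSError and exceptions are no longer indexable, so the errno attribute takes the place of the old e[0].

import errno
import sys

def flush_or_exit():
    # Stop quietly once nobody is reading our stdout any more.
    try:
        sys.stdout.flush()
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(2)
        raise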
- return 2 - utils.err("error: failed to collect data from %s: %s" % (db, e)) - errs.append(dbname) - - for dbname in errs: - del dbs[dbname] - - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) + + mysql_slave_status = db.query("SHOW SLAVE STATUS") + if mysql_slave_status: + slave_status = todict(db, mysql_slave_status[0]) + master_host = slave_status["master_host"] + else: + master_host = None + + if master_host and master_host != "None": + sbm = slave_status.get("seconds_behind_master") + if isinstance(sbm, int): + printmetric("slave.seconds_behind_master", sbm) + printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"]) + printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"]) + printmetric("slave.thread_io_running", + isyes(slave_status["slave_io_running"])) + printmetric("slave.thread_sql_running", + isyes(slave_status["slave_sql_running"])) + + states = {} # maps a connection state to number of connections in that state + for row in db.query("SHOW PROCESSLIST"): + id, user, host, db_, cmd, time, state = row[:7] + states[cmd] = states.get(cmd, 0) + 1 + for state, count in states.items(): + state = state.lower().replace(" ", "_") + printmetric("connection_states", count, " state=%s" % state) + + +def main(args): + """Collects and dumps stats from a MySQL server.""" + if MySQLdb is None: + utils.err("error: Python module `MySQLdb' is missing") + return 1 + + last_db_refresh = now() + dbs = find_databases() + while True: + ts = now() + if ts - last_db_refresh >= DB_REFRESH_INTERVAL: + find_databases(dbs) + last_db_refresh = ts + + errs = [] + for dbname, db in dbs.items(): + try: + collect(db) + except (EnvironmentError, EOFError, RuntimeError, socket.error, + MySQLdb.MySQLError) as e: + if isinstance(e, IOError) and e[0] == errno.EPIPE: + # Exit on a broken pipe. There's no point in continuing + # because no one will read our stdout anyway. + return 2 + utils.err("error: failed to collect data from %s: %s" % (db, e)) + errs.append(dbname) + + for dbname in errs: + del dbs[dbname] + + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main(sys.argv)) + sys.stdin.close() + sys.exit(main(sys.argv)) diff --git a/collectors/0/netstat.py b/collectors/0/netstat.py index 2c00dbbe..d8ab0a95 100755 --- a/collectors/0/netstat.py +++ b/collectors/0/netstat.py @@ -76,8 +76,8 @@ def main(): sockstat = open("/proc/net/sockstat") netstat = open("/proc/net/netstat") snmp = open("/proc/net/snmp") - except IOError, e: - print >>sys.stderr, "open failed: %s" % e + except IOError as e: + utils.err("open failed: %s" % e) return 13 # Ask tcollector to not re-start us. utils.drop_privileges() @@ -101,8 +101,7 @@ def main(): def print_sockstat(metric, value, tags=""): # Note: tags must start with ' ' if value is not None: - print "net.sockstat.%s %d %s%s" % (metric, ts, value, tags) - + print("net.sockstat.%s %d %s%s" % (metric, ts, value, tags)) # If a line in /proc/net/{netstat,snmp} doesn't start with a word in that # dict, we'll ignore it. We use the value to build the metric name. @@ -116,7 +115,7 @@ def print_sockstat(metric, value, tags=""): # Note: tags must start with ' ' "Udp:": "udp", "UdpLite:": "udplite", # We don't collect anything from here for now. "Arista:": "arista", # We don't collect anything from here for now. - } + } # Any stat in /proc/net/{netstat,snmp} that doesn't appear in this dict will # be ignored. If we find a match, we'll use the (metricname, tags). 
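Both /proc/net/netstat and /proc/net/snmp emit a header line followed by a matching values line for each family; a rough sketch of the pairing that parse_stats (below) performs, with made-up "TcpExt:" sample values. The list() wrapped around zip() in the patch is harmless on Python 3, since dict() consumes the zip iterator directly.

header = "TcpExt: ListenOverflows ListenDrops".split()
data = "TcpExt: 3 7".split()
assert header[0] == data[0] and len(header) == len(data)
statstype = header.pop(0)   # "TcpExt:", looked up in known_statstypes
data.pop(0)
stats = dict(zip(header, data))
# -> {'ListenOverflows': '3', 'ListenDrops': '7'}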
@@ -249,14 +248,13 @@ def print_sockstat(metric, value, tags=""): # Note: tags must start with ' ' }, } - def print_netstat(statstype, metric, value, tags=""): if tags: space = " " else: tags = space = "" - print "net.stat.%s.%s %d %s%s%s" % (statstype, metric, ts, value, - space, tags) + print("net.stat.%s.%s %d %s%s%s" % (statstype, metric, ts, value, + space, tags)) def parse_stats(stats, filename): statsdikt = {} @@ -278,20 +276,19 @@ def parse_stats(stats, filename): assert header[0] == data[0], repr((header, data)) assert len(header) == len(data), repr((header, data)) if header[0] not in known_statstypes: - print >>sys.stderr, ("Unrecoginized line in %s:" - " %r (file=%r)" % (filename, header, stats)) + utils.err("Unrecoginized line in %s: %r (file=%r)" % (filename, header, stats)) continue statstype = header.pop(0) data.pop(0) - stats = dict(zip(header, data)) + stats = dict(list(zip(header, data))) statsdikt.setdefault(known_statstypes[statstype], {}).update(stats) - for statstype, stats in statsdikt.iteritems(): + for statstype, stats in statsdikt.items(): # Undo the kernel's double counting if "ListenDrops" in stats: stats["ListenDrops"] = int(stats["ListenDrops"]) - int(stats.get("ListenOverflows", 0)) elif "RcvbufErrors" in stats: stats["InErrors"] = int(stats.get("InErrors", 0)) - int(stats["RcvbufErrors"]) - for stat, (metric, tags) in known_stats[statstype].iteritems(): + for stat, (metric, tags) in known_stats[statstype].items(): value = stats.get(stat) if value is not None: print_netstat(statstype, metric, value, tags) @@ -306,24 +303,24 @@ def parse_stats(stats, filename): snmpstats = snmp.read() m = re.match(regexp, data) if not m: - print >>sys.stderr, "Cannot parse sockstat: %r" % data + utils.err("Cannot parse sockstat: %r" % data) return 13 # The difference between the first two values is the number of # sockets allocated vs the number of sockets actually in use. 
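A hedged sketch of the /proc/net/sockstat "TCP:" line the regexp above captures; the field names follow the kernel's format, but the values and the use of resource.getpagesize() here are illustrative assumptions (the collector obtains its page size elsewhere in the file).

import resource

line = "TCP: inuse 5 orphan 0 tw 2 alloc 8 mem 3"
fields = line.split()
tcp = dict(zip(fields[1::2], fields[2::2]))
allocated = int(tcp["alloc"])   # sockets allocated ("num_sockets")
in_use = int(tcp["inuse"])      # sockets actually in use ("sockets_inuse")
mem_bytes = int(tcp["mem"]) * resource.getpagesize()  # "mem" is counted in pages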
- print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp") - print_sockstat("num_timewait", m.group("tw_count")) - print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp") - print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp") + print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp") + print_sockstat("num_timewait", m.group("tw_count")) + print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp") + print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp") print_sockstat("sockets_inuse", m.group("udplite_inuse"), " type=udplite") - print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw") + print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw") print_sockstat("num_orphans", m.group("orphans")) print_sockstat("memory", int(m.group("tcp_pages")) * page_size, " type=tcp") if m.group("udp_pages") is not None: - print_sockstat("memory", int(m.group("udp_pages")) * page_size, - " type=udp") + print_sockstat("memory", int(m.group("udp_pages")) * page_size, + " type=udp") print_sockstat("memory", m.group("ip_frag_mem"), " type=ipfrag") print_sockstat("ipfragqueues", m.group("ip_frag_nqueues")) @@ -333,5 +330,6 @@ def parse_stats(stats, filename): sys.stdout.flush() time.sleep(interval) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/nfsstat.py b/collectors/0/nfsstat.py index 1ac12465..6273f8aa 100755 --- a/collectors/0/nfsstat.py +++ b/collectors/0/nfsstat.py @@ -49,8 +49,8 @@ def main(): try: f_nfs = open("/proc/net/rpc/nfs") - except IOError, e: - print >>sys.stderr, "Failed to open input file: %s" % (e,) + except IOError as e: + utils.err("Failed to open input file: %s" % e) return 13 # Ask tcollector to not re-start us immediately. utils.drop_privileges() @@ -59,31 +59,31 @@ def main(): ts = int(time.time()) for line in f_nfs: fields = line.split() - if fields[0] in nfs_client_proc_names.keys(): + if fields[0] in list(nfs_client_proc_names.keys()): # NFSv4 # first entry should equal total count of subsequent entries assert int(fields[1]) == len(fields[2:]), ( - "reported count (%d) does not equal list length (%d)" - % (int(fields[1]), len(fields[2:]))) + "reported count (%d) does not equal list length (%d)" + % (int(fields[1]), len(fields[2:]))) for idx, val in enumerate(fields[2:]): try: - print ("nfs.client.rpc %d %s op=%s version=%s" + print("nfs.client.rpc %d %s op=%s version=%s" % (ts, int(val), nfs_client_proc_names[fields[0]][idx], fields[0][4:])) except IndexError: - print >> sys.stderr, ("Warning: name lookup failed" - " at position %d" % idx) + utils.err("Warning: name lookup failed at position %d" % idx) elif fields[0] == "rpc": # RPC calls = int(fields[1]) retrans = int(fields[2]) authrefrsh = int(fields[3]) - print "nfs.client.rpc.stats %d %d type=calls" % (ts, calls) - print "nfs.client.rpc.stats %d %d type=retrans" % (ts, retrans) - print ("nfs.client.rpc.stats %d %d type=authrefrsh" + print("nfs.client.rpc.stats %d %d type=calls" % (ts, calls)) + print("nfs.client.rpc.stats %d %d type=retrans" % (ts, retrans)) + print("nfs.client.rpc.stats %d %d type=authrefrsh" % (ts, authrefrsh)) sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/ntpstat.py b/collectors/0/ntpstat.py index 88a7b11a..3da22f20 100755 --- a/collectors/0/ntpstat.py +++ b/collectors/0/ntpstat.py @@ -17,8 +17,6 @@ # # ntp.offset estimated offset -import os -import socket import subprocess import sys import time @@ -27,19 +25,20 @@ from 
collectors.lib import utils try: - from collectors.etc import ntpstat_conf + from collectors.etc import ntpstat_conf except ImportError: - ntpstat_conf = None + ntpstat_conf = None + +DEFAULT_COLLECTION_INTERVAL = 60 -DEFAULT_COLLECTION_INTERVAL=60 def main(): """ntpstats main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - if(ntpstat_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + if (ntpstat_conf): config = ntpstat_conf.get_config() - collection_interval=config['collection_interval'] + collection_interval = config['collection_interval'] utils.drop_privileges() @@ -47,10 +46,10 @@ def main(): ts = int(time.time()) try: ntp_proc = subprocess.Popen(["ntpq", "-p"], stdout=subprocess.PIPE) - except OSError, e: + except OSError as e: if e.errno == errno.ENOENT: # looks like ntpdc is not available, stop using this collector - sys.exit(13) # we signal tcollector to stop using this + sys.exit(13) # we signal tcollector to stop using this raise stdout, _ = ntp_proc.communicate() @@ -62,14 +61,15 @@ def main(): if len(fields) <= 0: continue if fields[0].startswith("*"): - offset=fields[8] + offset = fields[8] continue - print ("ntp.offset %d %s" % (ts, offset)) + print("ntp.offset %d %s" % (ts, offset)) else: - print >> sys.stderr, "ntpq -p, returned %r" % (ntp_proc.returncode) + utils.err("ntpq -p, returned %r" % (ntp_proc.returncode)) sys.stdout.flush() time.sleep(collection_interval) + if __name__ == "__main__": main() diff --git a/collectors/0/postgresql.py b/collectors/0/postgresql.py index 07de39dd..c23181c3 100755 --- a/collectors/0/postgresql.py +++ b/collectors/0/postgresql.py @@ -26,122 +26,127 @@ import errno try: - import psycopg2 + import psycopg2 except ImportError: - psycopg2 = None # handled in main() + psycopg2 = None # handled in main() -COLLECTION_INTERVAL = 15 # seconds -CONNECT_TIMEOUT = 2 # seconds +COLLECTION_INTERVAL = 15 # seconds +CONNECT_TIMEOUT = 2 # seconds from collectors.lib import utils from collectors.etc import postgresqlconf # Directories under which to search socket files SEARCH_DIRS = frozenset([ - "/var/run/postgresql", # Debian default - "/var/pgsql_socket", # MacOS default - "/usr/local/var/postgres", # custom compilation - "/tmp", # custom compilation + "/var/run/postgresql", # Debian default + "/var/pgsql_socket", # MacOS default + "/usr/local/var/postgres", # custom compilation + "/tmp", # custom compilation ]) + def find_sockdir(): - """Returns a path to PostgreSQL socket file to monitor.""" - for dir in SEARCH_DIRS: - for dirpath, dirnames, dirfiles in os.walk(dir, followlinks=True): - for name in dirfiles: - # ensure selection of PostgreSQL socket only - if (utils.is_sockfile(os.path.join(dirpath, name)) - and "PGSQL" in name): - return(dirpath) + """Returns a path to PostgreSQL socket file to monitor.""" + for dir in SEARCH_DIRS: + for dirpath, dirnames, dirfiles in os.walk(dir, followlinks=True): + for name in dirfiles: + # ensure selection of PostgreSQL socket only + if (utils.is_sockfile(os.path.join(dirpath, name)) + and "PGSQL" in name): + return (dirpath) + def postgres_connect(sockdir): - """Connects to the PostgreSQL server using the specified socket file.""" - user, password = postgresqlconf.get_user_password() + """Connects to the PostgreSQL server using the specified socket file.""" + user, password = postgresqlconf.get_user_password() + + try: + return psycopg2.connect("host='%s' user='%s' password='%s' " + "connect_timeout='%s' dbname=postgres" + % (sockdir, user, password, + CONNECT_TIMEOUT)) + except 
(EnvironmentError, EOFError, RuntimeError, socket.error) as e: + utils.err("Couldn't connect to DB :%s" % (e)) - try: - return psycopg2.connect("host='%s' user='%s' password='%s' " - "connect_timeout='%s' dbname=postgres" - % (sockdir, user, password, - CONNECT_TIMEOUT)) - except (EnvironmentError, EOFError, RuntimeError, socket.error), e: - utils.err("Couldn't connect to DB :%s" % (e)) def collect(db): - """ - Collects and prints stats. - - Here we collect only general info, for full list of data for collection - see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html - """ - - try: - cursor = db.cursor() - - # general statics - cursor.execute("SELECT pg_stat_database.*, pg_database_size" - " (pg_database.datname) AS size FROM pg_database JOIN" - " pg_stat_database ON pg_database.datname =" - " pg_stat_database.datname WHERE pg_stat_database.datname" - " NOT IN ('template0', 'template1', 'postgres')") - ts = time.time() - stats = cursor.fetchall() - -# datid | datname | numbackends | xact_commit | xact_rollback | blks_read | blks_hit | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files | temp_bytes | deadlocks | blk_read_time | blk_write_time | stats_reset | size - result = {} - for stat in stats: - database = stat[1] - result[database] = stat - - for database in result: - for i in range(2,len(cursor.description)): - metric = cursor.description[i].name - value = result[database][i] - try: - if metric in ("stats_reset"): - continue - print ("postgresql.%s %i %s database=%s" - % (metric, ts, value, database)) - except: - utils.err("got here") - continue - - # connections - cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity" - " GROUP BY pg_stat_activity.datname") - ts = time.time() - connections = cursor.fetchall() - - for database, connection in connections: - print ("postgresql.connections %i %s database=%s" - % (ts, connection, database)) - - except (EnvironmentError, EOFError, RuntimeError, socket.error), e: - if isinstance(e, IOError) and e[0] == errno.EPIPE: - # exit on a broken pipe. There is no point in continuing - # because no one will read our stdout anyway. - return 2 - utils.err("error: failed to collect data: %s" % e) + """ + Collects and prints stats. 
+ + Here we collect only general info, for full list of data for collection + see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html + """ + + try: + cursor = db.cursor() + + # general statics + cursor.execute("SELECT pg_stat_database.*, pg_database_size" + " (pg_database.datname) AS size FROM pg_database JOIN" + " pg_stat_database ON pg_database.datname =" + " pg_stat_database.datname WHERE pg_stat_database.datname" + " NOT IN ('template0', 'template1', 'postgres')") + ts = time.time() + stats = cursor.fetchall() + + # datid | datname | numbackends | xact_commit | xact_rollback | blks_read | blks_hit | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files | temp_bytes | deadlocks | blk_read_time | blk_write_time | stats_reset | size + result = {} + for stat in stats: + database = stat[1] + result[database] = stat + + for database in result: + for i in range(2, len(cursor.description)): + metric = cursor.description[i].name + value = result[database][i] + try: + if metric in ("stats_reset"): + continue + print("postgresql.%s %i %s database=%s" + % (metric, ts, value, database)) + except: + utils.err("got here") + continue + + # connections + cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity" + " GROUP BY pg_stat_activity.datname") + ts = time.time() + connections = cursor.fetchall() + + for database, connection in connections: + print("postgresql.connections %i %s database=%s" + % (ts, connection, database)) + + except (EnvironmentError, EOFError, RuntimeError, socket.error) as e: + if isinstance(e, IOError) and e[0] == errno.EPIPE: + # exit on a broken pipe. There is no point in continuing + # because no one will read our stdout anyway. + return 2 + utils.err("error: failed to collect data: %s" % e) + def main(args): - """Collects and dumps stats from a PostgreSQL server.""" + """Collects and dumps stats from a PostgreSQL server.""" + + if psycopg2 is None: + utils.err("error: Python module 'psycopg2' is missing") + return 13 # Ask tcollector to not respawn us - if psycopg2 is None: - utils.err("error: Python module 'psycopg2' is missing") - return 13 # Ask tcollector to not respawn us + sockdir = find_sockdir() + if not sockdir: # Nothing to monitor + utils.err("error: Can't find postgresql socket file") + return 13 # Ask tcollector to not respawn us - sockdir = find_sockdir() - if not sockdir: # Nothing to monitor - utils.err("error: Can't find postgresql socket file") - return 13 # Ask tcollector to not respawn us + db = postgres_connect(sockdir) + db.autocommit = True - db = postgres_connect(sockdir) - db.autocommit=True + while True: + collect(db) + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) - while True: - collect(db) - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main(sys.argv)) + sys.stdin.close() + sys.exit(main(sys.argv)) diff --git a/collectors/0/procnettcp.py b/collectors/0/procnettcp.py index 1e0f381a..42778d14 100755 --- a/collectors/0/procnettcp.py +++ b/collectors/0/procnettcp.py @@ -55,7 +55,6 @@ from collectors.lib import utils - USERS = ("root", "www-data", "mysql") # Note if a service runs on multiple ports and you @@ -101,9 +100,9 @@ 11226: "memcache", 50020: "datanode", 60020: "hregionserver", - } +} -SERVICES = tuple(set(PORTS.itervalues())) +SERVICES = tuple(set(PORTS.values())) TCPSTATES = { "01": "established", @@ -117,7 +116,7 @@ "09": "last_ack", "0A": "listen", "0B": "closing", - } +} def is_public_ip(ipstr): @@ 
-143,10 +142,10 @@ def is_public_ip(ipstr): def main(unused_args): """procnettcp main loop""" - try: # On some Linux kernel versions, with lots of connections - os.nice(19) # this collector can be very CPU intensive. So be nicer. - except OSError, e: - print >>sys.stderr, "warning: failed to self-renice:", e + try: # On some Linux kernel versions, with lots of connections + os.nice(19) # this collector can be very CPU intensive. So be nicer. + except OSError as e: + utils.err("warning: failed to self-renice:", e) interval = 60 @@ -165,13 +164,13 @@ def main(unused_args): # address size try: tcp6 = open("/proc/net/tcp6") - except IOError, (errno, msg): - if errno == 2: # No such file => IPv6 is disabled. + except IOError as io_err: + if io_err.errno == 2: # No such file => IPv6 is disabled. tcp6 = None else: raise - except IOError, e: - print >>sys.stderr, "Failed to open input file: %s" % (e,) + except IOError as e: + utils.err("Failed to open input file: %s" % e) return 13 # Ask tcollector to not re-start us immediately. utils.drop_privileges() @@ -206,7 +205,6 @@ def main(unused_args): else: endpoint = "internal" - user = uids.get(uid, "other") key = "state=" + TCPSTATES[state] + " endpoint=" + endpoint + \ @@ -224,12 +222,13 @@ def main(unused_args): key = ("state=%s endpoint=%s service=%s user=%s" % (TCPSTATES[state], endpoint, service, user)) if key in counter: - print "proc.net.tcp", ts, counter[key], key + print("proc.net.tcp", ts, counter[key], key) else: - print "proc.net.tcp", ts, "0", key + print("proc.net.tcp", ts, "0", key) sys.stdout.flush() time.sleep(interval) + if __name__ == "__main__": sys.exit(main(sys.argv)) diff --git a/collectors/0/procstats.py b/collectors/0/procstats.py index 502fa015..120baeab 100755 --- a/collectors/0/procstats.py +++ b/collectors/0/procstats.py @@ -29,13 +29,14 @@ METRIC_MAPPING = yaml_conf.load_collector_configuration('node_metrics.yml') + def find_sysfs_numa_stats(): """Returns a possibly empty list of NUMA stat file names.""" try: nodes = os.listdir(NUMADIR) - except OSError, (errno, msg): - if errno == 2: # No such file or directory - return [] # We don't have NUMA stats. + except OSError as os_err: + if os_err.errno == 2: # No such file or directory + return [] # We don't have NUMA stats. raise nodes = [node for node in nodes if node.startswith("node")] @@ -43,8 +44,8 @@ def find_sysfs_numa_stats(): for node in nodes: try: numastats.append(os.path.join(NUMADIR, node, "numastat")) - except OSError, (errno, msg): - if errno == 2: # No such file or directory + except OSError as os_err: + if os_err.errno == 2: # No such file or directory continue raise return numastats @@ -54,31 +55,31 @@ def print_numa_stats(numafiles): """From a list of files names, opens file, extracts and prints NUMA stats.""" for numafilename in numafiles: numafile = open(numafilename) - node_id = int(numafile.name[numafile.name.find("/node/node")+10:-9]) + node_id = int(numafile.name[numafile.name.find("/node/node") + 10:-9]) ts = int(time.time()) stats = dict(line.split() for line in numafile.read().splitlines()) - for stat, tag in (# hit: process wanted memory from this node and got it - ("numa_hit", "hit"), - # miss: process wanted another node and got it from - # this one instead. - ("numa_miss", "miss")): - print ("sys.numa.zoneallocs %d %s node=%d type=%s" + for stat, tag in ( # hit: process wanted memory from this node and got it + ("numa_hit", "hit"), + # miss: process wanted another node and got it from + # this one instead. 
+ ("numa_miss", "miss")): + print("sys.numa.zoneallocs %d %s node=%d type=%s" % (ts, stats[stat], node_id, tag)) # Count this one as a separate metric because we can't sum up hit + # miss + foreign, this would result in double-counting of all misses. # See `zone_statistics' in the code of the kernel. # foreign: process wanted memory from this node but got it from # another node. So maybe this node is out of free pages. - print ("sys.numa.foreign_allocs %d %s node=%d" + print("sys.numa.foreign_allocs %d %s node=%d" % (ts, stats["numa_foreign"], node_id)) # When is memory allocated to a node that's local or remote to where # the process is running. for stat, tag in (("local_node", "local"), ("other_node", "remote")): - print ("sys.numa.allocation %d %s node=%d type=%s" + print("sys.numa.allocation %d %s node=%d type=%s" % (ts, stats[stat], node_id, tag)) # Pages successfully allocated with the interleave policy. - print ("sys.numa.interleave %d %s node=%d type=hit" + print("sys.numa.interleave %d %s node=%d type=hit" % (ts, stats["interleave_hit"], node_id)) numafile.close() @@ -95,19 +96,19 @@ def main(): f_interrupts = open("/proc/interrupts", "r") f_scaling = "/sys/devices/system/cpu/cpu%s/cpufreq/%s_freq" - f_scaling_min = dict([]) - f_scaling_max = dict([]) - f_scaling_cur = dict([]) + f_scaling_min = dict([]) + f_scaling_max = dict([]) + f_scaling_cur = dict([]) f_softirqs = open("/proc/softirqs", "r") for cpu in glob.glob("/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_cur_freq"): m = re.match("/sys/devices/system/cpu/cpu([0-9]*)/cpufreq/scaling_cur_freq", cpu) if not m: continue cpu_no = m.group(1) - sys.stderr.write(f_scaling % (cpu_no,"min")) - f_scaling_min[cpu_no] = open(f_scaling % (cpu_no,"cpuinfo_min"), "r") - f_scaling_max[cpu_no] = open(f_scaling % (cpu_no,"cpuinfo_max"), "r") - f_scaling_cur[cpu_no] = open(f_scaling % (cpu_no,"scaling_cur"), "r") + sys.stderr.write(f_scaling % (cpu_no, "min")) + f_scaling_min[cpu_no] = open(f_scaling % (cpu_no, "cpuinfo_min"), "r") + f_scaling_max[cpu_no] = open(f_scaling % (cpu_no, "cpuinfo_max"), "r") + f_scaling_cur[cpu_no] = open(f_scaling % (cpu_no, "scaling_cur"), "r") numastats = find_sysfs_numa_stats() utils.drop_privileges() @@ -119,9 +120,9 @@ def main(): for line in f_uptime: m = re.match("(\S+)\s+(\S+)", line) if m: - print "proc.uptime.total %d %s" % (ts, m.group(1)) + print("proc.uptime.total %d %s" % (ts, m.group(1))) metric_naming.print_if_apptuit_standard_metric("proc.uptime.total", METRIC_MAPPING, ts, m.group(1)) - print "proc.uptime.now %d %s" % (ts, m.group(2)) + print("proc.uptime.now %d %s" % (ts, m.group(2))) # proc.meminfo f_meminfo.seek(0) @@ -135,8 +136,7 @@ def main(): else: value = m.group(2) name = re.sub("\W", "_", m.group(1)).lower().strip("_") - print ("proc.meminfo.%s %d %s" - % (name, ts, value)) + print("proc.meminfo.%s %d %s" % (name, ts, value)) metric_naming.print_if_apptuit_standard_metric("proc.meminfo." + name, METRIC_MAPPING, ts, value) # proc.vmstat @@ -148,8 +148,8 @@ def main(): continue if m.group(1) in ("pgpgin", "pgpgout", "pswpin", "pswpout", "pgfault", "pgmajfault"): - print "proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2)) - metric_naming.print_if_apptuit_standard_metric("proc.vmstat."+m.group(1), METRIC_MAPPING, ts, + print("proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2))) + metric_naming.print_if_apptuit_standard_metric("proc.vmstat." 
+ m.group(1), METRIC_MAPPING, ts, m.group(2)) # proc.stat @@ -169,27 +169,26 @@ def main(): tags = '' fields = m.group(2).split() cpu_types = ['user', 'nice', 'system', 'idle', 'iowait', - 'irq', 'softirq', 'guest', 'guest_nice'] + 'irq', 'softirq', 'guest', 'guest_nice'] # We use zip to ignore fields that don't exist. for value, field_name in zip(fields, cpu_types): - print "proc.stat.cpu%s %d %s type=%s%s" % (metric_percpu, - ts, value, field_name, tags) - metric_naming.print_if_apptuit_standard_metric("proc.stat.cpu"+metric_percpu, METRIC_MAPPING, ts, + print("proc.stat.cpu%s %d %s type=%s%s" % (metric_percpu, + ts, value, field_name, tags)) + metric_naming.print_if_apptuit_standard_metric("proc.stat.cpu" + metric_percpu, METRIC_MAPPING, ts, value, tags={"type": field_name}, tags_str=tags) elif m.group(1) == "intr": intr = m.group(2).split()[0] - print ("proc.stat.intr %d %s" - % (ts, intr)) + print("proc.stat.intr %d %s" % (ts, intr)) metric_naming.print_if_apptuit_standard_metric("proc.stat.intr", METRIC_MAPPING, ts, intr) elif m.group(1) == "ctxt": - print "proc.stat.ctxt %d %s" % (ts, m.group(2)) + print("proc.stat.ctxt %d %s" % (ts, m.group(2))) metric_naming.print_if_apptuit_standard_metric("proc.stat.ctxt", METRIC_MAPPING, ts, m.group(2)) elif m.group(1) == "processes": - print "proc.stat.processes %d %s" % (ts, m.group(2)) + print("proc.stat.processes %d %s" % (ts, m.group(2))) metric_naming.print_if_apptuit_standard_metric("proc.stat.processes", METRIC_MAPPING, ts, m.group(2)) elif m.group(1) == "procs_blocked": - print "proc.stat.procs_blocked %d %s" % (ts, m.group(2)) + print("proc.stat.procs_blocked %d %s" % (ts, m.group(2))) f_loadavg.seek(0) ts = int(time.time()) @@ -197,19 +196,19 @@ def main(): m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\d+)/(\d+)\s+", line) if not m: continue - print "proc.loadavg.1min %d %s" % (ts, m.group(1)) + print("proc.loadavg.1min %d %s" % (ts, m.group(1))) metric_naming.print_if_apptuit_standard_metric("proc.loadavg.1min", METRIC_MAPPING, ts, m.group(1)) - print "proc.loadavg.5min %d %s" % (ts, m.group(2)) + print("proc.loadavg.5min %d %s" % (ts, m.group(2))) metric_naming.print_if_apptuit_standard_metric("proc.loadavg.5min", METRIC_MAPPING, ts, m.group(2)) - print "proc.loadavg.15min %d %s" % (ts, m.group(3)) + print("proc.loadavg.15min %d %s" % (ts, m.group(3))) metric_naming.print_if_apptuit_standard_metric("proc.loadavg.15min", METRIC_MAPPING, ts, m.group(3)) - print "proc.loadavg.runnable %d %s" % (ts, m.group(4)) - print "proc.loadavg.total_threads %d %s" % (ts, m.group(5)) + print("proc.loadavg.runnable %d %s" % (ts, m.group(4))) + print("proc.loadavg.total_threads %d %s" % (ts, m.group(5))) f_entropy_avail.seek(0) ts = int(time.time()) for line in f_entropy_avail: - print "proc.kernel.entropy_avail %d %s" % (ts, line.strip()) + print("proc.kernel.entropy_avail %d %s" % (ts, line.strip())) f_interrupts.seek(0) ts = int(time.time()) @@ -235,7 +234,7 @@ def main(): sys.stderr.write("Unexpected interrupts value %r in" " %r: " % (val, cols)) break - print ("proc.interrupts %s %s type=%s cpu=%s" + print("proc.interrupts %s %s type=%s cpu=%s" % (ts, val, irq_type, i)) f_softirqs.seek(0) @@ -256,34 +255,34 @@ def main(): sys.stderr.write("Unexpected softirq value %r in" " %r: " % (val, cols)) break - print ("proc.softirqs %s %s type=%s cpu=%s" + print("proc.softirqs %s %s type=%s cpu=%s" % (ts, val, irq_type, i)) print_numa_stats(numastats) # Print scaling stats ts = int(time.time()) - for cpu_no in f_scaling_min.keys(): + for cpu_no in 
list(f_scaling_min.keys()): f = f_scaling_min[cpu_no] f.seek(0) for line in f: - print "proc.scaling.min %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no) + print("proc.scaling.min %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no)) ts = int(time.time()) - for cpu_no in f_scaling_max.keys(): + for cpu_no in list(f_scaling_max.keys()): f = f_scaling_max[cpu_no] f.seek(0) for line in f: - print "proc.scaling.max %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no) + print("proc.scaling.max %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no)) ts = int(time.time()) - for cpu_no in f_scaling_cur.keys(): + for cpu_no in list(f_scaling_cur.keys()): f = f_scaling_cur[cpu_no] f.seek(0) for line in f: - print "proc.scaling.cur %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no) + print("proc.scaling.cur %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no)) sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": main() - diff --git a/collectors/0/pxc-collector.py b/collectors/0/pxc-collector.py index 09d269fa..88be6043 100755 --- a/collectors/0/pxc-collector.py +++ b/collectors/0/pxc-collector.py @@ -29,59 +29,65 @@ from collectors.etc import pxcconf from collectors.lib import utils -__author__ = "Kai Laufer" -__version__ = "1.0.1" -__email__ = "mail@kai-laufer.de" +__author__ = "Kai Laufer" +__version__ = "1.0.1" +__email__ = "mail@kai-laufer.de" """ You can find these functions and additional information in etc/pxcconf.py """ -prefix = pxcconf.getPrefix() or "pxc" # Prefix for the collector, e.g. pxc -> pxc.wsrep_replicated_bytes -interval = pxcconf.getInterval() or 1 # Interval for checking MySQL statistics -galeraFile = pxcconf.getGaleraFile() or "/usr/lib/libgalera_smm.so" # Path to a galera specific file for ensuring that check won't run with a usual MySQL server. Default: "/usr/lib/libgalera_smm.so" -login = pxcconf.getUserPassword() # MySQL-User, MySQL-Password and MySQL-Host (localhost) -myMap = pxcconf.getKeyMap() or ( "wsrep_last_committed", "wsrep_replicated", "wsrep_repl_keys", "wsrep_local_commits" ) # Status variables which should be read -mysqlUser = login[0] or "root" +prefix = pxcconf.getPrefix() or "pxc" # Prefix for the collector, e.g. pxc -> pxc.wsrep_replicated_bytes +interval = pxcconf.getInterval() or 1 # Interval for checking MySQL statistics +galeraFile = pxcconf.getGaleraFile() or "/usr/lib/libgalera_smm.so" # Path to a galera specific file for ensuring that check won't run with a usual MySQL server. Default: "/usr/lib/libgalera_smm.so" +login = pxcconf.getUserPassword() # MySQL-User, MySQL-Password and MySQL-Host (localhost) +myMap = pxcconf.getKeyMap() or ("wsrep_last_committed", "wsrep_replicated", "wsrep_repl_keys", + "wsrep_local_commits") # Status variables which should be read +mysqlUser = login[0] or "root" mysqlPasswd = login[1] or "" -mysqlHost = login[2] or "localhost" +mysqlHost = login[2] or "localhost" + def getRow(): - """ Test connection """ - try: - db = mysql.connect(host=mysqlHost, user=mysqlUser, passwd=mysqlPasswd) - cursor = db.cursor() - cursor.execute("SHOW STATUS LIKE '%wsrep%'") - result = cursor.fetchall() + """ Test connection """ + try: + db = mysql.connect(host=mysqlHost, user=mysqlUser, passwd=mysqlPasswd) + cursor = db.cursor() + cursor.execute("SHOW STATUS LIKE '%wsrep%'") + result = cursor.fetchall() + + except: + print("Error: unable to fetch data - Check your configuration!") + sys.exit(13) # Don't respawn collector - except: - print "Error: unable to fetch data - Check your configuration!" 
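For context, a small sketch of the OpenTSDB-style line this collector emits for one wsrep status row, using the "prefix.key timestamp value" layout of TSDResult.TSDRow() below; the row and timestamp are made-up sample values.

prefix = "pxc"
key, value = "wsrep_replicated", "12345"   # one row from SHOW STATUS LIKE '%wsrep%'
timestamp = 1519300000
print("%s.%s %s %s" % (prefix, key, timestamp, value))
# -> pxc.wsrep_replicated 1519300000 12345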
- sys.exit(13) # Don't respawn collector + db.close() + return result - db.close() - return result class TSDResult(object): - """ Create TSD output """ - def __init__(self, key, value, prefix, timestamp): - self.key = key - self.value = value - self.prefix = prefix - self.timestamp = timestamp + """ Create TSD output """ + + def __init__(self, key, value, prefix, timestamp): + self.key = key + self.value = value + self.prefix = prefix + self.timestamp = timestamp + + def TSDRow(self): + return "%s.%s %s %s" % (self.prefix, self.key, self.timestamp, self.value) - def TSDRow(self): - return "%s.%s %s %s" % (self.prefix, self.key, self.timestamp, self.value) def main(): - if os.path.isfile(galeraFile) is True: - while True: - rows = getRow() - for row in rows: - timestamp = int(time.time()) - if row[0] in myMap: - result = TSDResult(row[0], row[1], prefix, timestamp) - print result.TSDRow() - time.sleep(interval) - return 0 - else: - return 2 + if os.path.isfile(galeraFile) is True: + while True: + rows = getRow() + for row in rows: + timestamp = int(time.time()) + if row[0] in myMap: + result = TSDResult(row[0], row[1], prefix, timestamp) + print(result.TSDRow()) + time.sleep(interval) + return 0 + else: + return 2 + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/redis_stats.py b/collectors/0/redis_stats.py index 590b6fa9..c26918af 100755 --- a/collectors/0/redis_stats.py +++ b/collectors/0/redis_stats.py @@ -61,6 +61,7 @@ import subprocess import sys import time + from collectors.lib import utils from collectors.etc import redis_stats_conf @@ -112,7 +113,7 @@ def main(): def print_stat(metric, value, tags=""): if value is not None: - print "redis.%s %d %s %s" % (metric, ts, value, tags) + print("redis.%s %d %s %s" % (metric, ts, value, tags)) dbre = re.compile("^db\d+$") @@ -137,8 +138,8 @@ def print_stat(metric, value, tags=""): print_stat(key, info[key], tags) # per database metrics - for db in filter(dbre.match, info.keys()): - for db_metric in info[db].keys(): + for db in filter(dbre.match, list(info.keys())): + for db_metric in list(info[db].keys()): print_stat(db_metric, info[db][db_metric], "%s db=%s" % (tags, db)) # get some instant latency information @@ -163,7 +164,7 @@ def scan_for_instances(): ns_proc = subprocess.Popen(["netstat", "-tnlp"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, _ = ns_proc.communicate() if ns_proc.returncode != 0: - print >> sys.stderr, "failed to find instances %r" % ns_proc.returncode + utils.err("failed to find instances %r" % ns_proc.returncode) return {} for line in stdout.split("\n"): diff --git a/collectors/0/riak.py b/collectors/0/riak.py index d2016b5e..e31814ca 100755 --- a/collectors/0/riak.py +++ b/collectors/0/riak.py @@ -44,7 +44,7 @@ import os import sys import time -import urllib2 +import urllib.request from collectors.lib import utils @@ -69,7 +69,7 @@ 'executing_mappers': ('executing_mappers', ''), 'mem_allocated': ('memory.allocated', ''), 'mem_total': ('memory.total', ''), - #connected_nodes is calculated + # connected_nodes is calculated } @@ -87,12 +87,12 @@ def main(): def print_stat(metric, value, tags=""): if value is not None: - print "riak.%s %d %s %s" % (metric, ts, value, tags) + print("riak.%s %d %s %s" % (metric, ts, value, tags)) while True: ts = int(time.time()) - req = urllib2.urlopen("http://localhost:8098/stats") + req = urllib.request.urlopen("http://localhost:8098/stats") if req is not None: obj = json.loads(req.read()) for key in obj: diff --git 
a/collectors/0/smart_stats.py b/collectors/0/smart_stats.py index 735aec51..90666b9b 100755 --- a/collectors/0/smart_stats.py +++ b/collectors/0/smart_stats.py @@ -23,13 +23,12 @@ import time from collectors.lib import utils - try: from collectors.etc import smart_stats_conf except ImportError: smart_stats_conf = None -DEFAULT_COLLECTION_INTERVAL=120 +DEFAULT_COLLECTION_INTERVAL = 120 TWCLI = "/usr/sbin/tw_cli" ARCCONF = "/usr/local/bin/arcconf" @@ -43,199 +42,199 @@ # Common smart attributes, add more to this list if you start seeing # numbers instead of attribute names in TSD results. ATTRIBUTE_MAP = { - "1": "raw_read_error_rate", - "2": "throughput_performance", - "3": "spin_up_time", - "4": "start_stop_count", - "5": "reallocated_sector_ct", - "7": "seek_error_rate", - "8": "seek_time_performance", - "9": "power_on_hours", - "10": "spin_retry_count", - "11": "recalibration_retries", - "12": "power_cycle_count", - "13": "soft_read_error_rate", - "175": "program_fail_count_chip", - "176": "erase_fail_count_chip", - "177": "wear_leveling_count", - "178": "used_rsvd_blk_cnt_chip", - "179": "used_rsvd_blk_cnt_tot", - "180": "unused_rsvd_blk_cnt_tot", - "181": "program_fail_cnt_total", - "182": "erase_fail_count_total", - "183": "runtime_bad_block", - "184": "end_to_end_error", - "187": "reported_uncorrect", - "188": "command_timeout", - "189": "high_fly_writes", - "190": "airflow_temperature_celsius", - "191": "g_sense_error_rate", - "192": "power-off_retract_count", - "193": "load_cycle_count", - "194": "temperature_celsius", - "195": "hardware_ecc_recovered", - "196": "reallocated_event_count", - "197": "current_pending_sector", - "198": "offline_uncorrectable", - "199": "udma_crc_error_count", - "200": "write_error_rate", - "233": "media_wearout_indicator", - "240": "transfer_error_rate", - "241": "total_lba_writes", - "242": "total_lba_read", - } + "1": "raw_read_error_rate", + "2": "throughput_performance", + "3": "spin_up_time", + "4": "start_stop_count", + "5": "reallocated_sector_ct", + "7": "seek_error_rate", + "8": "seek_time_performance", + "9": "power_on_hours", + "10": "spin_retry_count", + "11": "recalibration_retries", + "12": "power_cycle_count", + "13": "soft_read_error_rate", + "175": "program_fail_count_chip", + "176": "erase_fail_count_chip", + "177": "wear_leveling_count", + "178": "used_rsvd_blk_cnt_chip", + "179": "used_rsvd_blk_cnt_tot", + "180": "unused_rsvd_blk_cnt_tot", + "181": "program_fail_cnt_total", + "182": "erase_fail_count_total", + "183": "runtime_bad_block", + "184": "end_to_end_error", + "187": "reported_uncorrect", + "188": "command_timeout", + "189": "high_fly_writes", + "190": "airflow_temperature_celsius", + "191": "g_sense_error_rate", + "192": "power-off_retract_count", + "193": "load_cycle_count", + "194": "temperature_celsius", + "195": "hardware_ecc_recovered", + "196": "reallocated_event_count", + "197": "current_pending_sector", + "198": "offline_uncorrectable", + "199": "udma_crc_error_count", + "200": "write_error_rate", + "233": "media_wearout_indicator", + "240": "transfer_error_rate", + "241": "total_lba_writes", + "242": "total_lba_read", +} class Alarm(RuntimeError): - pass + pass def alarm_handler(signum, frame): - print >>sys.stderr, ("Program took too long to run, " - "consider increasing its timeout.") - raise Alarm() + utils.err("Program took too long to run, consider increasing its timeout.") + raise Alarm() def smart_is_broken(drives): - """Determines whether SMART can be used. + """Determines whether SMART can be used. 
- Args: - drives: A list of device names on which we intend to use SMART. + Args: + drives: A list of device names on which we intend to use SMART. - Returns: - True if SMART is available, False otherwise. - """ - if os.path.exists(ARCCONF): - return is_adaptec_driver_broken() - if os.path.exists(TWCLI): - return is_3ware_driver_broken(drives) - return False + Returns: + True if SMART is available, False otherwise. + """ + if os.path.exists(ARCCONF): + return is_adaptec_driver_broken() + if os.path.exists(TWCLI): + return is_3ware_driver_broken(drives) + return False def is_adaptec_driver_broken(): - signal.alarm(COMMAND_TIMEOUT) - arcconf = subprocess.Popen("%s %s" % (ARCCONF, ARCCONF_ARGS), - shell=True, - stdout=subprocess.PIPE) - arcconf_output = arcconf.communicate()[0] - signal.alarm(0) - if arcconf.returncode != 0: - if arcconf_output and arcconf_output.startswith(NO_CONTROLLER): - # No controller => no problem. - return False - if arcconf.returncode == 127: - # arcconf doesn't even work on this system, so assume we're safe - return False - print >>sys.stderr, ("arcconf unexpected error %s" % arcconf.returncode) - return True - for line in arcconf_output.split("\n"): - fields = [x for x in line.split(" ") if x] - if fields[0] == "Driver" and fields[2] in BROKEN_DRIVER_VERSIONS: - print >>sys.stderr, ("arcconf indicates broken driver version %s" - % fields[2]) - return True - return False - -def is_3ware_driver_broken(drives): - # Apparently 3ware controllers can't report SMART stats from SAS drives. WTF. - # See also http://sourceforge.net/apps/trac/smartmontools/ticket/161 - for i in reversed(xrange(len(drives))): - drive = drives[i] signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i /dev/" + drive, - shell=True, stdout=subprocess.PIPE) - smart_output = smart_ctl.communicate()[0] - if "supports SMART and is Disabled" in smart_output: - print >>sys.stderr, "SMART is disabled for %s" % drive - del drives[i] # We're iterating from the end of the list so this is OK. + arcconf = subprocess.Popen("%s %s" % (ARCCONF, ARCCONF_ARGS), + shell=True, + stdout=subprocess.PIPE) + arcconf_output = arcconf.communicate()[0] signal.alarm(0) - if not drives: - print >>sys.stderr, "None of the drives support SMART. Are they SAS drives?" - return True - return False + if arcconf.returncode != 0: + if arcconf_output and arcconf_output.startswith(NO_CONTROLLER): + # No controller => no problem. + return False + if arcconf.returncode == 127: + # arcconf doesn't even work on this system, so assume we're safe + return False + utils.err("arcconf unexpected error %s" % arcconf.returncode) + return True + for line in arcconf_output.split("\n"): + fields = [x for x in line.split(" ") if x] + if fields[0] == "Driver" and fields[2] in BROKEN_DRIVER_VERSIONS: + utils.err("arcconf indicates broken driver version %s" % fields[2]) + return True + return False + + +def is_3ware_driver_broken(drives): + # Apparently 3ware controllers can't report SMART stats from SAS drives. WTF. + # See also http://sourceforge.net/apps/trac/smartmontools/ticket/161 + for i in reversed(range(len(drives))): + drive = drives[i] + signal.alarm(COMMAND_TIMEOUT) + smart_ctl = subprocess.Popen(SMART_CTL + " -i /dev/" + drive, + shell=True, stdout=subprocess.PIPE) + smart_output = smart_ctl.communicate()[0] + if "supports SMART and is Disabled" in smart_output: + utils.err("SMART is disabled for %s" % drive) + del drives[i] # We're iterating from the end of the list so this is OK. 
+ signal.alarm(0) + if not drives: + utils.err("None of the drives support SMART. Are they SAS drives?") + return True + return False def process_output(drive, smart_output): - """Print formatted SMART output for the drive""" - ts = int(time.time()) - smart_output = smart_output.split("\n") - # Set data_marker to 0, so we skip stuff until we see a line - # beginning with ID# in the output. Start processing rows after - # that point. - data_marker = False - is_seagate = False - - for line in smart_output: - if data_marker: - fields = line.split() - if len(fields) < 2: - continue - field = fields[0] - if len(fields) > 2 and field in ATTRIBUTE_MAP: - metric = ATTRIBUTE_MAP[field] - value = fields[9].split()[0] - print ("smart.%s %d %s disk=%s" % (metric, ts, value, drive)) - if is_seagate and metric in ("seek_error_rate", "raw_read_error_rate"): - # It appears that some Seagate drives (and possibly some Western - # Digital ones too) use the first 16 bits to store error counts, - # and the low 32 bits to store operation counts, out of these 48 - # bit values. So try to be helpful and extract these here. - value = int(value) - print ("smart.%s %d %d disk=%s" - % (metric.replace("error_rate", "count"), ts, - value & 0xFFFFFFFF, drive)) - print ("smart.%s %d %d disk=%s" - % (metric.replace("error_rate", "errors"), ts, - (value & 0xFFFF00000000) >> 32, drive)) - elif line.startswith("ID#"): - data_marker = True - elif line.startswith("Device Model:"): - model = line.split(None, 2)[2] - # Rough approximation to detect Seagate drives. - is_seagate = model.startswith("ST") + """Print formatted SMART output for the drive""" + ts = int(time.time()) + smart_output = smart_output.split("\n") + # Set data_marker to 0, so we skip stuff until we see a line + # beginning with ID# in the output. Start processing rows after + # that point. + data_marker = False + is_seagate = False + + for line in smart_output: + if data_marker: + fields = line.split() + if len(fields) < 2: + continue + field = fields[0] + if len(fields) > 2 and field in ATTRIBUTE_MAP: + metric = ATTRIBUTE_MAP[field] + value = fields[9].split()[0] + print("smart.%s %d %s disk=%s" % (metric, ts, value, drive)) + if is_seagate and metric in ("seek_error_rate", "raw_read_error_rate"): + # It appears that some Seagate drives (and possibly some Western + # Digital ones too) use the first 16 bits to store error counts, + # and the low 32 bits to store operation counts, out of these 48 + # bit values. So try to be helpful and extract these here. + value = int(value) + print("smart.%s %d %d disk=%s" + % (metric.replace("error_rate", "count"), ts, + value & 0xFFFFFFFF, drive)) + print("smart.%s %d %d disk=%s" + % (metric.replace("error_rate", "errors"), ts, + (value & 0xFFFF00000000) >> 32, drive)) + elif line.startswith("ID#"): + data_marker = True + elif line.startswith("Device Model:"): + model = line.split(None, 2)[2] + # Rough approximation to detect Seagate drives. + is_seagate = model.startswith("ST") def main(): - """main loop for SMART collector""" - - collection_interval=DEFAULT_COLLECTION_INTERVAL - if(smart_stats_conf): - config = smart_stats_conf.get_config() - collection_interval=config['collection_interval'] - - # Get the list of block devices. 
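A worked example of the Seagate raw-value split performed above, using a made-up 48-bit seek_error_rate value.

value = (7 << 32) | 123456789              # 7 errors packed above 123456789 operations
operations = value & 0xFFFFFFFF            # low 32 bits, reported as smart.seek_count
errors = (value & 0xFFFF00000000) >> 32    # high 16 bits, reported as smart.seek_errors
assert (operations, errors) == (123456789, 7)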
- drives = [dev[5:] for dev in glob.glob("/dev/[hs]d[a-z]")] - # Try FreeBSD drives if no block devices found - if not drives: - drives = [dev[5:] for dev in glob.glob("/dev/da[0-9]")+glob.glob("/dev/da[0-9][0-9]")+glob.glob("/dev/ada[0-9]")+glob.glob("/dev/ada[0-9][0-9]")] - # Exit gracefully if no block devices found - if not drives: - sys.exit(13) - - - # to make sure we are done with smartctl in COMMAND_TIMEOUT seconds - signal.signal(signal.SIGALRM, alarm_handler) - - if smart_is_broken(drives): - sys.exit(13) - - while True: - for drive in drives: - signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i -A /dev/" + drive, - shell=True, stdout=subprocess.PIPE) - smart_output = smart_ctl.communicate()[0] - signal.alarm(0) - if smart_ctl.returncode != 0: - if smart_ctl.returncode == 127: - sys.exit(13) - else: - print >>sys.stderr, "Command exited with: %d" % smart_ctl.returncode - process_output(drive, smart_output) - - sys.stdout.flush() - time.sleep(collection_interval) + """main loop for SMART collector""" + + collection_interval = DEFAULT_COLLECTION_INTERVAL + if (smart_stats_conf): + config = smart_stats_conf.get_config() + collection_interval = config['collection_interval'] + + # Get the list of block devices. + drives = [dev[5:] for dev in glob.glob("/dev/[hs]d[a-z]")] + # Try FreeBSD drives if no block devices found + if not drives: + drives = [dev[5:] for dev in + glob.glob("/dev/da[0-9]") + glob.glob("/dev/da[0-9][0-9]") + glob.glob("/dev/ada[0-9]") + glob.glob( + "/dev/ada[0-9][0-9]")] + # Exit gracefully if no block devices found + if not drives: + sys.exit(13) + + # to make sure we are done with smartctl in COMMAND_TIMEOUT seconds + signal.signal(signal.SIGALRM, alarm_handler) + + if smart_is_broken(drives): + sys.exit(13) + + while True: + for drive in drives: + signal.alarm(COMMAND_TIMEOUT) + smart_ctl = subprocess.Popen(SMART_CTL + " -i -A /dev/" + drive, + shell=True, stdout=subprocess.PIPE) + smart_output = smart_ctl.communicate()[0] + signal.alarm(0) + if smart_ctl.returncode != 0: + if smart_ctl.returncode == 127: + sys.exit(13) + else: + utils.err("Command exited with: %d" % smart_ctl.returncode) + process_output(drive, smart_output) + + sys.stdout.flush() + time.sleep(collection_interval) if __name__ == "__main__": - main() + main() diff --git a/collectors/0/sysload.py b/collectors/0/sysload.py index c10d3d73..0b2812fc 100755 --- a/collectors/0/sysload.py +++ b/collectors/0/sysload.py @@ -52,39 +52,44 @@ except ImportError: sysload_conf = None -DEFAULT_COLLECTION_INTERVAL=15 +DEFAULT_COLLECTION_INTERVAL = 15 + def convert_to_bytes(string): """Take a string in the form 1234K, and convert to bytes""" factors = { - "K": 1024, - "M": 1024 * 1024, - "G": 1024 * 1024 * 1024, - "T": 1024 * 1024 * 1024 * 1024, - "P": 1024 * 1024 * 1024 * 1024 * 1024, - "E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, + "K": 1024, + "M": 1024 * 1024, + "G": 1024 * 1024 * 1024, + "T": 1024 * 1024 * 1024 * 1024, + "P": 1024 * 1024 * 1024 * 1024 * 1024, + "E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, } for f, fm in factors.items(): if string.endswith(f): number = float(string[:-1]) number = number * fm - return long(number) - return long(string) + return int(number) + return int(string) + signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """top main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - collect_every_cpu=True - if(sysload_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL 
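A quick worked example of the suffix arithmetic in convert_to_bytes above; the sample strings are made up, and since Python 3 ints are unbounded, plain int() fully replaces the old long().

assert int(float("4") * 1024) == 4096                  # "4K"
assert int(float("1.5") * 1024 * 1024) == 1572864      # "1.5M"
assert int("2048") == 2048                             # no suffix: parsed as-is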
+ collect_every_cpu = True + if (sysload_conf): config = sysload_conf.get_config() - collection_interval=config['collection_interval'] - collect_every_cpu=config['collect_every_cpu'] + collection_interval = config['collection_interval'] + collect_every_cpu = config['collect_every_cpu'] global signal_received @@ -93,18 +98,18 @@ def main(): try: if platform.system() == "FreeBSD": - if(collect_every_cpu): + if (collect_every_cpu): p_top = subprocess.Popen( - ["top", "-S", "-P", "-n", "-s"+str(collection_interval), "-dinfinity", "0"], + ["top", "-S", "-P", "-n", "-s" + str(collection_interval), "-dinfinity", "0"], stdout=subprocess.PIPE, ) else: p_top = subprocess.Popen( - ["top", "-S", "-n", "-s"+str(collection_interval), "-dinfinity", "0"], + ["top", "-S", "-n", "-s" + str(collection_interval), "-dinfinity", "0"], stdout=subprocess.PIPE, - ) + ) else: - if(collect_every_cpu): + if (collect_every_cpu): p_top = subprocess.Popen( ["mpstat", "-P", "ALL", str(collection_interval)], stdout=subprocess.PIPE, @@ -114,10 +119,10 @@ def main(): ["mpstat", str(collection_interval)], stdout=subprocess.PIPE, ) - except OSError, e: + except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise timestamp = 0 @@ -125,7 +130,7 @@ def main(): while signal_received is None: try: line = p_top.stdout.readline() - except (IOError, OSError), e: + except (IOError, OSError) as e: if e.errno in (errno.EINTR, errno.EAGAIN): break raise @@ -144,139 +149,141 @@ def main(): if len(fields) <= 0: continue - if (((fields[0] == "CPU") or (re.match("[0-9][0-9]:[0-9][0-9]:[0-9][0-9]",fields[0]))) and ((collect_every_cpu and re.match("[0-9]+:?",fields[1])) or ((not collect_every_cpu) and re.match("all:?",fields[1])))): - if((fields[1] == "all") or (fields[1] == "0")): + if (((fields[0] == "CPU") or (re.match("[0-9][0-9]:[0-9][0-9]:[0-9][0-9]", fields[0]))) and ( + (collect_every_cpu and re.match("[0-9]+:?", fields[1])) or ( + (not collect_every_cpu) and re.match("all:?", fields[1])))): + if ((fields[1] == "all") or (fields[1] == "0")): timestamp = int(time.time()) - cpuid=fields[1].replace(":","") - cpuuser=fields[2] - cpunice=fields[3] - cpusystem=fields[4] - cpuinterrupt=fields[6] - cpuidle=fields[-1] - print ("cpu.usr %s %s cpu=%s" % (timestamp, float(cpuuser), cpuid)) - print ("cpu.nice %s %s cpu=%s" % (timestamp, float(cpunice), cpuid)) - print ("cpu.sys %s %s cpu=%s" % (timestamp, float(cpusystem), cpuid)) - print ("cpu.irq %s %s cpu=%s" % (timestamp, float(cpuinterrupt), cpuid)) - print ("cpu.idle %s %s cpu=%s" % (timestamp, float(cpuidle), cpuid)) - - elif(fields[0] == "averages:"): + cpuid = fields[1].replace(b":", b"") + cpuuser = fields[2] + cpunice = fields[3] + cpusystem = fields[4] + cpuinterrupt = fields[6] + cpuidle = fields[-1] + print("cpu.usr %s %s cpu=%s" % (timestamp, float(cpuuser), cpuid)) + print("cpu.nice %s %s cpu=%s" % (timestamp, float(cpunice), cpuid)) + print("cpu.sys %s %s cpu=%s" % (timestamp, float(cpusystem), cpuid)) + print("cpu.irq %s %s cpu=%s" % (timestamp, float(cpuinterrupt), cpuid)) + print("cpu.idle %s %s cpu=%s" % (timestamp, float(cpuidle), cpuid)) + + elif (fields[0] == "averages:"): timestamp = int(time.time()) - print ("load.1m %s %s" % (timestamp, fields[1])) - print ("load.5m %s %s" % (timestamp, fields[2])) - print ("load.15m %s %s" % (timestamp, fields[3])) - - elif (re.match("[0-9]+ processes:",line)): - starting=0 - running=0 - 
sleeping=0 - stopped=0 - zombie=0 - waiting=0 - lock=0 + print("load.1m %s %s" % (timestamp, fields[1])) + print("load.5m %s %s" % (timestamp, fields[2])) + print("load.15m %s %s" % (timestamp, fields[3])) + + elif (re.match("[0-9]+ processes:", line)): + starting = 0 + running = 0 + sleeping = 0 + stopped = 0 + zombie = 0 + waiting = 0 + lock = 0 for i in range(len(fields)): - if(fields[i] == "starting"): - starting=fields[i-1] - if(fields[i] == "running"): - running=fields[i-1] - if(fields[i] == "sleeping"): - sleeping=fields[i-1] - if(fields[i] == "stopped"): - stopped=fields[i-1] - if(fields[i] == "zombie"): - zombie=fields[i-1] - if(fields[i] == "waiting"): - waiting=fields[i-1] - if(fields[i] == "lock"): - lock=fields[i-1] - print ("ps.all %s %s" % (timestamp, fields[0])) - print ("ps.start %s %s" % (timestamp, starting)) - print ("ps.run %s %s" % (timestamp, running)) - print ("ps.sleep %s %s" % (timestamp, sleeping)) - print ("ps.stop %s %s" % (timestamp, stopped)) - print ("ps.zomb %s %s" % (timestamp, zombie)) - print ("ps.wait %s %s" % (timestamp, waiting)) - print ("ps.lock %s %s" % (timestamp, lock)) - - elif(fields[0] == "Mem:"): - active=0 - inact=0 - wired=0 - cache=0 - buf=0 - free=0 + if (fields[i] == "starting"): + starting = fields[i - 1] + if (fields[i] == "running"): + running = fields[i - 1] + if (fields[i] == "sleeping"): + sleeping = fields[i - 1] + if (fields[i] == "stopped"): + stopped = fields[i - 1] + if (fields[i] == "zombie"): + zombie = fields[i - 1] + if (fields[i] == "waiting"): + waiting = fields[i - 1] + if (fields[i] == "lock"): + lock = fields[i - 1] + print("ps.all %s %s" % (timestamp, fields[0])) + print("ps.start %s %s" % (timestamp, starting)) + print("ps.run %s %s" % (timestamp, running)) + print("ps.sleep %s %s" % (timestamp, sleeping)) + print("ps.stop %s %s" % (timestamp, stopped)) + print("ps.zomb %s %s" % (timestamp, zombie)) + print("ps.wait %s %s" % (timestamp, waiting)) + print("ps.lock %s %s" % (timestamp, lock)) + + elif (fields[0] == "Mem:"): + active = 0 + inact = 0 + wired = 0 + cache = 0 + buf = 0 + free = 0 for i in range(len(fields)): - if(fields[i] == "Active"): - active=convert_to_bytes(fields[i-1]) - if(fields[i] == "Inact"): - inact=convert_to_bytes(fields[i-1]) - if(fields[i] == "Wired"): - wired=convert_to_bytes(fields[i-1]) - if(fields[i] == "Cache"): - cache=convert_to_bytes(fields[i-1]) - if(fields[i] == "Buf"): - buf=convert_to_bytes(fields[i-1]) - if(fields[i] == "Free"): - free=convert_to_bytes(fields[i-1]) - print ("mem.active %s %s" % (timestamp, active)) - print ("mem.inact %s %s" % (timestamp, inact)) - print ("mem.wired %s %s" % (timestamp, wired)) - print ("mem.cache %s %s" % (timestamp, cache)) - print ("mem.buf %s %s" % (timestamp, buf)) - print ("mem.free %s %s" % (timestamp, free)) - - elif(fields[0] == "ARC:"): - total=0 - mru=0 - mfu=0 - anon=0 - header=0 - other=0 + if (fields[i] == "Active"): + active = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Inact"): + inact = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Wired"): + wired = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Cache"): + cache = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Buf"): + buf = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Free"): + free = convert_to_bytes(fields[i - 1]) + print("mem.active %s %s" % (timestamp, active)) + print("mem.inact %s %s" % (timestamp, inact)) + print("mem.wired %s %s" % (timestamp, wired)) + print("mem.cache %s %s" % (timestamp, cache)) + print("mem.buf %s %s" % (timestamp, 
buf)) + print("mem.free %s %s" % (timestamp, free)) + + elif (fields[0] == "ARC:"): + total = 0 + mru = 0 + mfu = 0 + anon = 0 + header = 0 + other = 0 for i in range(len(fields)): - if(fields[i] == "Total"): - total=convert_to_bytes(fields[i-1]) - if(fields[i] == "MRU"): - mru=convert_to_bytes(fields[i-1]) - if(fields[i] == "MFU"): - mfu=convert_to_bytes(fields[i-1]) - if(fields[i] == "Anon"): - anon=convert_to_bytes(fields[i-1]) - if(fields[i] == "Header"): - header=convert_to_bytes(fields[i-1]) - if(fields[i] == "Other"): - other=convert_to_bytes(fields[i-1]) - print ("arc.total %s %s" % (timestamp, total)) - print ("arc.mru %s %s" % (timestamp, mru)) - print ("arc.mfu %s %s" % (timestamp, mfu)) - print ("arc.anon %s %s" % (timestamp, anon)) - print ("arc.header %s %s" % (timestamp, header)) - print ("arc.other %s %s" % (timestamp, other)) - - elif(fields[0] == "Swap:"): - total=0 - used=0 - free=0 - inuse=0 - inps=0 - outps=0 + if (fields[i] == "Total"): + total = convert_to_bytes(fields[i - 1]) + if (fields[i] == "MRU"): + mru = convert_to_bytes(fields[i - 1]) + if (fields[i] == "MFU"): + mfu = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Anon"): + anon = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Header"): + header = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Other"): + other = convert_to_bytes(fields[i - 1]) + print("arc.total %s %s" % (timestamp, total)) + print("arc.mru %s %s" % (timestamp, mru)) + print("arc.mfu %s %s" % (timestamp, mfu)) + print("arc.anon %s %s" % (timestamp, anon)) + print("arc.header %s %s" % (timestamp, header)) + print("arc.other %s %s" % (timestamp, other)) + + elif (fields[0] == "Swap:"): + total = 0 + used = 0 + free = 0 + inuse = 0 + inps = 0 + outps = 0 for i in range(len(fields)): - if(fields[i] == "Total"): - total=convert_to_bytes(fields[i-1]) - if(fields[i] == "Used"): - used=convert_to_bytes(fields[i-1]) - if(fields[i] == "Free"): - free=convert_to_bytes(fields[i-1]) - if(fields[i] == "Inuse"): - inuse=convert_to_bytes(fields[i-1]) - if(fields[i] == "In"): - inps=convert_to_bytes(fields[i-1])/collection_interval - if(fields[i] == "Out"): - outps=convert_to_bytes(fields[i-1])/collection_interval - print ("swap.total %s %s" % (timestamp, total)) - print ("swap.used %s %s" % (timestamp, used)) - print ("swap.free %s %s" % (timestamp, free)) - print ("swap.inuse %s %s" % (timestamp, inuse)) - print ("swap.inps %s %s" % (timestamp, inps)) - print ("swap.outps %s %s" % (timestamp, outps)) + if (fields[i] == "Total"): + total = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Used"): + used = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Free"): + free = convert_to_bytes(fields[i - 1]) + if (fields[i] == "Inuse"): + inuse = convert_to_bytes(fields[i - 1]) + if (fields[i] == "In"): + inps = convert_to_bytes(fields[i - 1]) / collection_interval + if (fields[i] == "Out"): + outps = convert_to_bytes(fields[i - 1]) / collection_interval + print("swap.total %s %s" % (timestamp, total)) + print("swap.used %s %s" % (timestamp, used)) + print("swap.free %s %s" % (timestamp, free)) + print("swap.inuse %s %s" % (timestamp, inuse)) + print("swap.inps %s %s" % (timestamp, inps)) + print("swap.outps %s %s" % (timestamp, outps)) sys.stdout.flush() @@ -288,5 +295,6 @@ def main(): pass p_top.wait() + if __name__ == "__main__": main() diff --git a/collectors/0/tcollector.py b/collectors/0/tcollector.py index ee44720a..d94422d9 100755 --- a/collectors/0/tcollector.py +++ b/collectors/0/tcollector.py @@ -22,7 +22,6 @@ """ import os -import 
pwd import resource import sys import time @@ -78,12 +77,12 @@ def stat(self): raise ProcessTerminatedError() rv = {"pid": spl[0], "comm": spl[1], "ppid": spl[3], - "utime": spl[13], "stime": spl[14], "cutime": spl[15], - "cstime": spl[16], "vsize": spl[22], "rss": spl[23]} + "utime": spl[13], "stime": spl[14], "cutime": spl[15], + "cstime": spl[16], "vsize": spl[22], "rss": spl[23]} # supported since Kernel 2.6.24 if len(spl) > 43: - rv.update({"guest_time": spl[42], - "cguest_time": spl[43]}) + rv.update({"guest_time": spl[42], + "cguest_time": spl[43]}) return rv @@ -92,6 +91,7 @@ class ProcessTable(object): Process informations are gathered from /proc. """ + def __init__(self): self.processes = {} self.update() @@ -114,7 +114,8 @@ def update(self): def filter(self, cond): """ Return processes for that the function cond evaluates to true. """ - return filter(cond, self.processes.values()) + return list(filter(cond, list(self.processes.values()))) + def collect_tcollect_stats(processes): # print a msg and do nothing if the parent process isn't tcollector @@ -123,7 +124,7 @@ def collect_tcollect_stats(processes): tcol_process = Process(tcol_pid) if not "tcollector.py" in " ".join(tcol_process.cmdline): sys.stderr.write("Parent Process %s isn't a tcollector instance\n" % - tcol_pid) + tcol_pid) return tcollect_procs = processes.filter(lambda p: p.ppid == tcol_pid) @@ -154,9 +155,9 @@ def collect_tcollect_stats(processes): print("tcollector.cputime %s %s name=%s" % (ts, cpu_time, comm)) print("tcollector.mem_bytes %s %s name=%s type=vsize" % - (ts, s["vsize"], comm)) + (ts, s["vsize"], comm)) print("tcollector.mem_bytes %s %s name=%s type=rss" % - (ts, int(s["rss"]) * resource.getpagesize(), comm)) + (ts, int(s["rss"]) * resource.getpagesize(), comm)) def main(): @@ -169,5 +170,6 @@ def main(): time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": main() diff --git a/collectors/0/tcp_bridge.py b/collectors/0/tcp_bridge.py index a723d5fb..0c1e7918 100755 --- a/collectors/0/tcp_bridge.py +++ b/collectors/0/tcp_bridge.py @@ -18,12 +18,16 @@ import sys import time from collectors.lib import utils -from thread import * + +try: + from _thread import start_new_thread +except ImportError: + from thread import start_new_thread try: from collectors.etc import tcp_bridge_conf except ImportError: - print >> sys.stderr, 'unable to import tcp_bridge_conf' + utils.err('unable to import tcp_bridge_conf') tcp_bridge_conf = None HOST = '127.0.0.1' @@ -40,21 +44,22 @@ # buffered stdout seems to break metrics out = os.fdopen(sys.stdout.fileno(), 'w', 0) + def main(): if not (tcp_bridge_conf and tcp_bridge_conf.enabled()): - print >> sys.stderr, 'not enabled, or tcp_bridge_conf unavilable' + utils.err('not enabled, or tcp_bridge_conf unavilable') sys.exit(13) utils.drop_privileges() def printm(string, time, value): - out.write(m_namespace+string+' '+str(time)+' '+str(value)+'\n') + out.write(m_namespace + string + ' ' + str(time) + ' ' + str(value) + '\n') def printmetrics(): global m_delay global m_last ts = int(time.time()) - if ts > m_last+m_delay: + if ts > m_last + m_delay: printm('lines_read', ts, m_lines) printm('connections_processed', ts, m_connections) printm('processing_time', ts, m_ptime) @@ -103,7 +108,7 @@ def removePut(line): sock.bind((HOST, PORT)) sock.listen(1) - except socket.error, msg: + except socket.error as msg: utils.err('could not open socket: %s' % msg) sys.exit(1) @@ -124,6 +129,7 @@ def removePut(line): finally: sock.close() + if __name__ == "__main__": main() diff --git 
a/collectors/0/udp_bridge.py b/collectors/0/udp_bridge.py index cfec4537..5f5ebc99 100755 --- a/collectors/0/udp_bridge.py +++ b/collectors/0/udp_bridge.py @@ -19,17 +19,18 @@ from collectors.lib import utils try: - from collectors.etc import udp_bridge_conf + from collectors.etc import udp_bridge_conf except ImportError: - udp_bridge_conf = None + udp_bridge_conf = None HOST = '127.0.0.1' PORT = 8953 SIZE = 8192 + def main(): if not (udp_bridge_conf and udp_bridge_conf.enabled()): - sys.exit(13) + sys.exit(13) utils.drop_privileges() def removePut(line): @@ -40,11 +41,11 @@ def removePut(line): try: if (udp_bridge_conf and udp_bridge_conf.usetcp()): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) else: - sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.bind((HOST, PORT)) - except socket.error, msg: + except socket.error as msg: utils.err('could not open socket: %s' % msg) sys.exit(1) @@ -64,7 +65,7 @@ def removePut(line): if not data: utils.err("invalid data") break - print data + print(data) now = int(time.time()) if now > flush_timeout: sys.stdout.flush() @@ -75,6 +76,7 @@ def removePut(line): finally: sock.close() + if __name__ == "__main__": main() diff --git a/collectors/0/varnishstat.py b/collectors/0/varnishstat.py index b2806946..b5931a34 100755 --- a/collectors/0/varnishstat.py +++ b/collectors/0/varnishstat.py @@ -24,7 +24,7 @@ from collectors.lib import utils -interval = 10 # seconds +interval = 10 # seconds # If you would rather use the timestamp returned by varnishstat instead of a # local timestamp, then change this value to "True" @@ -41,6 +41,7 @@ # Collect all metrics vstats = "all" + # Collect metrics a la carte # vstats = frozenset([ # "client_conn", @@ -52,47 +53,47 @@ # ]) def main(): - utils.drop_privileges() - bad_regex = re.compile("[,()]+") # avoid forbidden by TSD symbols - - while True: - try: - if vstats == "all": - stats = subprocess.Popen( - ["varnishstat", "-1", "-j"], - stdout=subprocess.PIPE, - ) - else: - fields = ",".join(vstats) - stats = subprocess.Popen( - ["varnishstat", "-1", "-f" + fields, "-j"], - stdout=subprocess.PIPE, - ) - except OSError, e: - # Die and signal to tcollector not to run this script. - sys.stderr.write("Error: %s\n" % e) - sys.exit(13) - - metrics = "" - for line in stats.stdout.readlines(): - metrics += line - metrics = json.loads(metrics) - - timestamp = "" - if use_varnishstat_timestamp: - pattern = "%Y-%m-%dT%H:%M:%S" - timestamp = int(time.mktime(time.strptime(metrics['timestamp'], pattern))) - else: - timestamp = time.time() - - for k, v in metrics.iteritems(): - if k != "timestamp" and None == bad_regex.search(k): - metric_name = metric_prefix + "." + k - print "%s %d %s %s" % \ - (metric_name, timestamp, v['value'], ",".join(tags)) - - sys.stdout.flush() - time.sleep(interval) + utils.drop_privileges() + bad_regex = re.compile("[,()]+") # avoid forbidden by TSD symbols + + while True: + try: + if vstats == "all": + stats = subprocess.Popen( + ["varnishstat", "-1", "-j"], + stdout=subprocess.PIPE, + ) + else: + fields = ",".join(vstats) + stats = subprocess.Popen( + ["varnishstat", "-1", "-f" + fields, "-j"], + stdout=subprocess.PIPE, + ) + except OSError as e: + # Die and signal to tcollector not to run this script. 
+ sys.stderr.write("Error: %s\n" % e) + sys.exit(13) + + metrics = "" + for line in stats.stdout.readlines(): + metrics += line + metrics = json.loads(metrics) + + timestamp = "" + if use_varnishstat_timestamp: + pattern = "%Y-%m-%dT%H:%M:%S" + timestamp = int(time.mktime(time.strptime(metrics['timestamp'], pattern))) + else: + timestamp = time.time() + + for k, v in metrics.items(): + if k != "timestamp" and None == bad_regex.search(k): + metric_name = metric_prefix + "." + k + print("%s %d %s %s" % (metric_name, timestamp, v['value'], ",".join(tags))) + + sys.stdout.flush() + time.sleep(interval) + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/zabbix_bridge.py b/collectors/0/zabbix_bridge.py index 8be5d03c..7e31de02 100755 --- a/collectors/0/zabbix_bridge.py +++ b/collectors/0/zabbix_bridge.py @@ -70,14 +70,17 @@ def main(): cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?', (itemid,)) row = cachecur.fetchone() if (row is not None): - print "zbx.%s %d %s host=%s proxy=%s" % (row[1], r['clock'], r['value'], row[2], row[3]) - if ((int(time.time()) - sample_last_ts) > settings['internal_metric_interval']): # Sample internal metrics @ 10s intervals + print("zbx.%s %d %s host=%s proxy=%s" % (row[1], r['clock'], r['value'], row[2], row[3])) + if ((int(time.time()) - sample_last_ts) > settings[ + 'internal_metric_interval']): # Sample internal metrics @ 10s intervals sample_last_ts = int(time.time()) - print "tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos) - print "tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss) - print "tcollector.zabbix_bridge.timestamp_drift %d %s" % (sample_last_ts, (sample_last_ts - r['clock'])) + print("tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos)) + print("tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss)) + print("tcollector.zabbix_bridge.timestamp_drift %d %s" % ( + sample_last_ts, (sample_last_ts - r['clock']))) if ((key_lookup_miss - last_key_lookup_miss) > settings['dbrefresh']): - print "tcollector.zabbix_bridge.key_lookup_miss_reload %d %s" % (sample_last_ts, (key_lookup_miss - last_key_lookup_miss)) + print("tcollector.zabbix_bridge.key_lookup_miss_reload %d %s" % ( + sample_last_ts, (key_lookup_miss - last_key_lookup_miss))) cachecur.execute('DROP TABLE zabbix_cache') cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache') cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)') @@ -96,7 +99,6 @@ def main(): sys.stdin.close() sys.exit(main()) - ## Sample zabbix debug dump: # === WriteRowsEvent === # Date: 2014-08-04T03:47:37 diff --git a/collectors/0/zfsiostats.py b/collectors/0/zfsiostats.py index 96d6bb83..cc80578c 100755 --- a/collectors/0/zfsiostats.py +++ b/collectors/0/zfsiostats.py @@ -46,51 +46,54 @@ except ImportError: zfsiostats_conf = None -DEFAULT_COLLECTION_INTERVAL=15 -DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES=20 -DEFAULT_REPORT_DISKS_IN_VDEVS=False +DEFAULT_COLLECTION_INTERVAL = 15 +DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES = 20 +DEFAULT_REPORT_DISKS_IN_VDEVS = False + def convert_to_bytes(string): """Take a string in the form 1234K, and convert to bytes""" factors = { - "K": 1024, - "M": 1024 * 1024, - "G": 1024 * 1024 * 1024, - "T": 1024 * 1024 * 1024 * 1024, - "P": 1024 * 1024 * 1024 * 1024 * 1024, - "E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, + "K": 1024, + "M": 1024 * 1024, + "G": 1024 * 1024 * 1024, + "T": 1024 * 1024 * 1024 * 
1024, + "P": 1024 * 1024 * 1024 * 1024 * 1024, + "E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, } if string == "-": return -1 for f, fm in factors.items(): if string.endswith(f): number = float(string[:-1]) number = number * fm - return long(number) - return long(string) + return int(number) + return int(string) + def convert_wo_prefix(string): """Take a string in the form 1234K, and convert without metric prefix""" factors = { - "K": 1000, - "M": 1000 * 1000, - "G": 1000 * 1000 * 1000, - "T": 1000 * 1000 * 1000 * 1000, - "P": 1000 * 1000 * 1000 * 1000 * 1000, - "E": 1000 * 1000 * 1000 * 1000 * 1000 * 1000, + "K": 1000, + "M": 1000 * 1000, + "G": 1000 * 1000 * 1000, + "T": 1000 * 1000 * 1000 * 1000, + "P": 1000 * 1000 * 1000 * 1000 * 1000, + "E": 1000 * 1000 * 1000 * 1000 * 1000 * 1000, } if string == "-": return -1 for f, fm in factors.items(): if string.endswith(f): number = float(string[:-1]) number = number * fm - return long(number) - return long(string) + return int(number) + return int(string) + -def extract_info(line,report_disks_in_vdevs): +def extract_info(line, report_disks_in_vdevs): (poolname, - alloc, free, - read_issued, write_issued, - read_throughput, write_throughput) = line.split() + alloc, free, + read_issued, write_issued, + read_throughput, write_throughput) = line.split() s_io = {} # magnitudeless variable @@ -108,11 +111,12 @@ def extract_info(line,report_disks_in_vdevs): s_df["free"] = convert_to_bytes(free) / 1024 if ((s_df["used"] < 0) or (s_df["free"] < 0)): s_df = {} - if(not report_disks_in_vdevs): + if (not report_disks_in_vdevs): s_io = {} return poolname, s_df, s_io + T_START = 1 T_HEADERS = 2 T_SEPARATOR = 3 @@ -122,22 +126,25 @@ def extract_info(line,report_disks_in_vdevs): T_LEG = 7 signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """zfsiostats main loop""" global signal_received - collection_interval=DEFAULT_COLLECTION_INTERVAL - report_capacity_every_x_times=DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES - report_disks_in_vdevs=DEFAULT_REPORT_DISKS_IN_VDEVS - if(zfsiostats_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + report_capacity_every_x_times = DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES + report_disks_in_vdevs = DEFAULT_REPORT_DISKS_IN_VDEVS + if (zfsiostats_conf): config = zfsiostats_conf.get_config() - collection_interval=config['collection_interval'] - report_capacity_every_x_times=config['report_capacity_every_x_times'] - report_disks_in_vdevs=config['report_disks_in_vdevs'] + collection_interval = config['collection_interval'] + report_capacity_every_x_times = config['report_capacity_every_x_times'] + report_disks_in_vdevs = config['report_disks_in_vdevs'] signal.signal(signal.SIGTERM, handlesignal) signal.signal(signal.SIGINT, handlesignal) @@ -147,14 +154,14 @@ def main(): ["zpool", "iostat", "-v", str(collection_interval)], stdout=subprocess.PIPE, ) - except OSError, e: + except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise firstloop = True - report_capacity = (report_capacity_every_x_times-1) + report_capacity = (report_capacity_every_x_times - 1) lastleg = 0 ltype = None timestamp = int(time.time()) @@ -168,7 +175,7 @@ def main(): while signal_received is None: try: line = p_zpool.stdout.readline() - except (IOError, OSError), e: + except (IOError, OSError) as e: if e.errno in (errno.EINTR, errno.EAGAIN): break 
raise @@ -203,7 +210,7 @@ def main(): ltype = T_DEVICE else: # must be a pool name - #assert ltype == T_SEPARATOR, \ + # assert ltype == T_SEPARATOR, \ # "expecting last state T_SEPARATOR, now got %s" % ltype if ltype == T_SEPARATOR: parentpoolname = "" @@ -211,19 +218,19 @@ def main(): if ltype == T_START: for x in ( - capacity_stats_pool, capacity_stats_device, - io_stats_pool, io_stats_device, - ): + capacity_stats_pool, capacity_stats_device, + io_stats_pool, io_stats_device, + ): x.clear() timestamp = int(time.time()) elif ltype == T_POOL: line = line.strip() - poolname, s_df, s_io = extract_info(line,report_disks_in_vdevs) + poolname, s_df, s_io = extract_info(line, report_disks_in_vdevs) if parentpoolname == "": parentpoolname = poolname else: - poolname=parentpoolname+"."+poolname + poolname = parentpoolname + "." + poolname capacity_stats_pool[poolname] = s_df io_stats_pool[poolname] = s_io # marker for leg @@ -232,13 +239,13 @@ def main(): elif ltype == T_LEG: last_leg = last_leg + 1 line = line.strip() - devicename, s_df, s_io = extract_info(line,report_disks_in_vdevs) + devicename, s_df, s_io = extract_info(line, report_disks_in_vdevs) capacity_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_df io_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_io elif ltype == T_DEVICE: line = line.strip() - devicename, s_df, s_io = extract_info(line,report_disks_in_vdevs) + devicename, s_df, s_io = extract_info(line, report_disks_in_vdevs) capacity_stats_device["%s %s" % (poolname, devicename)] = s_df io_stats_device["%s %s" % (poolname, devicename)] = s_io @@ -246,17 +253,17 @@ def main(): if report_capacity_every_x_times > 0: report_capacity += 1 if report_capacity == report_capacity_every_x_times: - report_capacity=0 + report_capacity = 0 for poolname, stats in capacity_stats_pool.items(): fm = "zfs.df.pool.kb.%s %d %s pool=%s" for statname, statnumber in stats.items(): - print fm % (statname, timestamp, statnumber, poolname) + print(fm % (statname, timestamp, statnumber, poolname)) for devicename, stats in capacity_stats_device.items(): fm = "zfs.df.device.kb.%s %d %s device=%s pool=%s" poolname, devicename = devicename.split(" ", 1) for statname, statnumber in stats.items(): - print fm % (statname, timestamp, statnumber, - devicename, poolname) + print(fm % (statname, timestamp, statnumber, + devicename, poolname)) if firstloop: # this flag prevents printing out of the data in the first loop # which is a since-boot summary similar to iostat @@ -266,13 +273,13 @@ def main(): for poolname, stats in io_stats_pool.items(): fm = "zfs.io.pool.%s %d %s pool=%s" for statname, statnumber in stats.items(): - print fm % (statname, timestamp, statnumber, poolname) + print(fm % (statname, timestamp, statnumber, poolname)) for devicename, stats in io_stats_device.items(): fm = "zfs.io.device.%s %d %s device=%s pool=%s" poolname, devicename = devicename.split(" ", 1) for statname, statnumber in stats.items(): - print fm % (statname, timestamp, statnumber, - devicename, poolname) + print(fm % (statname, timestamp, statnumber, + devicename, poolname)) sys.stdout.flush() if signal_received is None: @@ -283,6 +290,6 @@ def main(): pass p_zpool.wait() + if __name__ == "__main__": main() - diff --git a/collectors/0/zfsolkernstats.py b/collectors/0/zfsolkernstats.py index b5389486..b68a60ed 100755 --- a/collectors/0/zfsolkernstats.py +++ b/collectors/0/zfsolkernstats.py @@ -29,6 +29,7 @@ zfs.mem.arc ''' + # /proc/spl/slab has several fields. 
we only care about the sizes # and the allocation sizes for the slabs # /proc/spl/kstat/zfs/arcstats is a table. we only care about the data column @@ -41,10 +42,10 @@ def main(): try: f_slab = open("/proc/spl/kmem/slab", "r") f_arcstats = open("/proc/spl/kstat/zfs/arcstats", "r") - except IOError, e: + except IOError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise while True: @@ -63,12 +64,12 @@ def main(): typ = typ.group(1) else: typ = name - print ("zfs.mem.slab.size %d %d type=%s objsize=%d" % + print("zfs.mem.slab.size %d %d type=%s objsize=%d" % (ts, size, typ, objsize) - ) - print ("zfs.mem.slab.alloc %d %d type=%s objsize=%d" % + ) + print("zfs.mem.slab.alloc %d %d type=%s objsize=%d" % (ts, alloc, typ, objsize) - ) + ) for n, line in enumerate(f_arcstats): if n < 2: @@ -76,13 +77,13 @@ def main(): line = line.split() name, _, data = line data = int(data) - print ("zfs.mem.arc.%s %d %d" % + print("zfs.mem.arc.%s %d %d" % (name, ts, data) - ) + ) sys.stdout.flush() time.sleep(interval) + if __name__ == "__main__": main() - diff --git a/collectors/0/zookeeper.py b/collectors/0/zookeeper.py index 6da5ac91..ff66389e 100755 --- a/collectors/0/zookeeper.py +++ b/collectors/0/zookeeper.py @@ -48,7 +48,8 @@ "zk_outstanding_requests", "zk_approximate_data_size", "zk_open_file_descriptor_count", - ]) +]) + def scan_zk_instances(): """ @@ -68,7 +69,7 @@ def scan_zk_instances(): except OSError: utils.err("netstat is not in PATH") return instances - except CalledProcessError, err: + except CalledProcessError as err: utils.err("Error: %s" % err) for line in listen_sock.split("\n"): @@ -92,23 +93,25 @@ def scan_zk_instances(): sock.settimeout(0.5) sock.send("ruok\n") data = sock.recv(1024) - except Exception, err: + except Exception as err: utils.err(err) finally: - if sock: + if sock: sock.close() - if data == "imok": + if data == "imok": instances.append([ip, port, tcp_version]) data = "" - except Exception, err: + except Exception as err: utils.err(err) finally: fd.close() - return instances + return instances + def print_stat(metric, ts, value, tags=""): if value is not None: - print "zookeeper.%s %i %s %s" % (metric, ts, value, tags) + print("zookeeper.%s %i %s %s" % (metric, ts, value, tags)) + def connect_socket(tcp_version, port): sock = None @@ -120,10 +123,11 @@ def connect_socket(tcp_version, port): ipaddr = '127.0.0.1' try: sock.connect((ipaddr, port)) - except Exception, err: + except Exception as err: utils.err(err) return sock + def main(): if USER != "root": utils.drop_privileges(user=USER) @@ -160,5 +164,6 @@ def main(): time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/300/aws_cloudwatch_stats.py b/collectors/300/aws_cloudwatch_stats.py index 910fae49..a4a82bc6 100755 --- a/collectors/300/aws_cloudwatch_stats.py +++ b/collectors/300/aws_cloudwatch_stats.py @@ -4,11 +4,16 @@ import time import datetime import re -import json -from collections import OrderedDict + import exceptions import threading -import Queue + +is_py2 = sys.version[0] == '2' +if is_py2: + import Queue as queue +else: + import queue as queue + from time import mktime from collectors.lib import utils from collectors.etc import aws_cloudwatch_conf @@ -26,44 +31,49 @@ COLLECTION_INTERVAL = int(path.split('/')[-1]) if COLLECTION_INTERVAL == 0: - sys.stderr.write("AWS Cloudwatch Stats is not a long 
running collector\n") - sys.exit(13) + sys.stderr.write("AWS Cloudwatch Stats is not a long running collector\n") + sys.exit(13) if COLLECTION_INTERVAL < 60: - sys.stderr.write("AWS Cloudwatch Stats is an heavy collector and should not be run more than once per minute.\n") - sys.exit(13) + sys.stderr.write("AWS Cloudwatch Stats is an heavy collector and should not be run more than once per minute.\n") + sys.exit(13) STATISTICS = frozenset([ - 'Minimum', - 'Maximum', - 'Average', - 'Sum', - 'SampleCount' - ]) + 'Minimum', + 'Maximum', + 'Average', + 'Sum', + 'SampleCount' +]) + +sendQueue = queue.Queue() -sendQueue = Queue.Queue() def validate_config(): access_key, secret_access_key = aws_cloudwatch_conf.get_accesskey_secretkey() if access_key == '' or secret_access_key == '': - sys.stderr.write("Cloudwatch Collector is not configured\n") - sys.exit(13) + sys.stderr.write("Cloudwatch Collector is not configured\n") + sys.exit(13) if not aws_cloudwatch_conf.enabled: - sys.stderr.write("Cloudwatch Collector is not enabled\n") - sys.exit(13) + sys.stderr.write("Cloudwatch Collector is not enabled\n") + sys.exit(13) + def cloudwatch_connect_to_region(region): access_key, secret_access_key = aws_cloudwatch_conf.get_accesskey_secretkey() try: - conn = boto.ec2.cloudwatch.connect_to_region(region, aws_access_key_id=access_key, aws_secret_access_key=secret_access_key) + conn = boto.ec2.cloudwatch.connect_to_region(region, aws_access_key_id=access_key, + aws_secret_access_key=secret_access_key) except: - print "Unexpected error:", sys.exc_info()[0] + print("Unexpected error:", sys.exc_info()[0]) else: return conn + def cloudwatch_list_metrics(conn): return conn.list_metrics() + def cloudwatch_query_metric(region, metric, statistic): end = datetime.datetime.utcnow() start = end - datetime.timedelta(seconds=COLLECTION_INTERVAL) @@ -74,21 +84,25 @@ def cloudwatch_query_metric(region, metric, statistic): if len(datapoints) > 0: for datapoint in datapoints: timestamp = format_timestamp(str(datapoint['Timestamp'])) - value = int(datapoint[statistic]) - metric_full = " %s.%s.%s" % (metric.namespace.lower().replace('/','.'), metric_name, statistic.lower()) - output = "%s.%s.%s %s %s %s" % (metric.namespace.lower().replace('/','.'), metric_name, statistic.lower(), str(timestamp), str(value), tags) + value = int(datapoint[statistic]) + metric_full = " %s.%s.%s" % (metric.namespace.lower().replace('/', '.'), metric_name, statistic.lower()) + output = "%s.%s.%s %s %s %s" % ( + metric.namespace.lower().replace('/', '.'), metric_name, statistic.lower(), str(timestamp), str(value), + tags) if validate_line_parses(output): sendQueue.put({'timestamp': timestamp, 'output': output}) + def format_timestamp(ts): st = time.strptime(ts, "%Y-%m-%d %H:%M:%S") dt = datetime.datetime.fromtimestamp(mktime(st)) return dt.strftime("%s") + def build_tag_list(metric_name, region, dimensions): tags = "region=" + str(region) - for tagk,tagv in dimensions.iteritems(): + for tagk, tagv in dimensions.items(): tagkey = str(tagk) tagval = str(tagv[0]) tags += " %s=%s" % (tagkey, tagval) @@ -103,35 +117,39 @@ def build_tag_list(metric_name, region, dimensions): return metric_name.strip().lower(), tags.strip().lower() + def ec2_connect_to_region(region): access_key, secret_access_key = aws_cloudwatch_conf.get_accesskey_secretkey() return boto.ec2.connect_to_region(region, aws_access_key_id=access_key, aws_secret_access_key=secret_access_key) + def ec2_list_regions(): ec2_regions = [] for i in boto.ec2.cloudwatch.regions(): 
ec2_regions.append(str(i.name)) return ec2_regions + def handle_region(region, statistic): try: -# sys.stderr.write("starting region " + region + "," + statistic + "\n") + # sys.stderr.write("starting region " + region + "," + statistic + "\n") region_conn = cloudwatch_connect_to_region(region) metrics = cloudwatch_list_metrics(region_conn) for metric in metrics: cloudwatch_query_metric(region, metric, statistic) - except boto.exception.BotoServerError, e: - # sys.stderr.write("finished region " + region + "," + statistic + "\n") + except boto.exception.BotoServerError as e: + # sys.stderr.write("finished region " + region + "," + statistic + "\n") pass except exceptions.KeyboardInterrupt: return 0 except: sys.stderr.write("failed region " + region + "," + statistic + "\n") raise -# else: -# sys.stderr.write("finished region " + region + "," + statistic + "\n") + # else: + # sys.stderr.write("finished region " + region + "," + statistic + "\n") return 1 + def send_metrics(): sys.stderr.write("Processing sendQueue \n") datapoints = {} @@ -145,26 +163,28 @@ def send_metrics(): datapoints[timestamp].append(output) sendQueue.task_done() sys.stderr.write("Queue Emptied, sorting output") - for outputs in sorted(datapoints.iteritems(), key=lambda x: x[1]): + for outputs in sorted(iter(datapoints.items()), key=lambda x: x[1]): for output in outputs: for t in output: - print t + print(t) except exceptions.KeyboardInterrupt: return 0 + # Uses the same code as tcollector here def validate_line_parses(line): - parsed = re.match('^([-_./a-zA-Z0-9]+)\s+' # Metric name. - '(\d+)\s+' # Timestamp. - '(\S+?)' # Value (int or float). - '((?:\s+[-_./a-zA-Z0-9]+=[-_./a-zA-Z0-9]+)*)$', # Tags - line) + parsed = re.match('^([-_./a-zA-Z0-9]+)\s+' # Metric name. + '(\d+)\s+' # Timestamp. + '(\S+?)' # Value (int or float). 
+ '((?:\s+[-_./a-zA-Z0-9]+=[-_./a-zA-Z0-9]+)*)$', # Tags + line) if parsed is None: sys.stderr.write("invalid data: %s \n" % (line)) return False metric, timestamp, value, tags = parsed.groups() return True + def main(): try: utils.drop_privileges() @@ -172,7 +192,7 @@ def main(): regions = ec2_list_regions() for reg in regions: for statistic in STATISTICS: - t = threading.Thread(target=handle_region, kwargs={"region":reg, "statistic":statistic}) + t = threading.Thread(target=handle_region, kwargs={"region": reg, "statistic": statistic}) t.start() while threading.activeCount() > 1: time.sleep(1) @@ -183,5 +203,6 @@ def main(): if not sendQueue.empty(): send_metrics() + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/900/zabbix_bridge_cache.py b/collectors/900/zabbix_bridge_cache.py index 08d44698..3cc05aa9 100755 --- a/collectors/900/zabbix_bridge_cache.py +++ b/collectors/900/zabbix_bridge_cache.py @@ -21,10 +21,11 @@ import sqlite3 import sys import time + try: import pymysql except ImportError: - pymysql = None # This is handled gracefully in main() + pymysql = None # This is handled gracefully in main() from collectors.etc import zabbix_bridge_conf from collectors.lib import utils @@ -50,20 +51,20 @@ def main(): else: utils.err("Zabbix bridge SQLite DB exists @ %s" % (db_filename)) - dbzbx = pymysql.connect(**settings['mysql']) zbxcur = dbzbx.cursor() - zbxcur.execute("SELECT i.itemid, i.key_, h.host, h2.host AS proxy FROM items i JOIN hosts h ON i.hostid=h.hostid LEFT JOIN hosts h2 ON h2.hostid=h.proxy_hostid") + zbxcur.execute( + "SELECT i.itemid, i.key_, h.host, h2.host AS proxy FROM items i JOIN hosts h ON i.hostid=h.hostid LEFT JOIN hosts h2 ON h2.hostid=h.proxy_hostid") # Translation of item key_ # Note: http://opentsdb.net/docs/build/html/user_guide/writing.html#metrics-and-tags disallow = re.compile(settings['disallow']) cachecur = dbcache.cursor() print('tcollector.zabbix_bridge.deleterows %d %s' % - (int(time.time()), cachecur.execute('DELETE FROM zabbix_cache').rowcount)) + (int(time.time()), cachecur.execute('DELETE FROM zabbix_cache').rowcount)) rowcount = 0 for row in zbxcur: cachecur.execute('''INSERT INTO zabbix_cache(id, key, host, proxy) VALUES (?,?,?,?)''', - (row[0], re.sub(disallow, '_', row[1]), re.sub(disallow, '_', row[2]), row[3])) + (row[0], re.sub(disallow, '_', row[1]), re.sub(disallow, '_', row[2]), row[3])) rowcount += 1 print('tcollector.zabbix_bridge.rows %d %s' % (int(time.time()), rowcount)) diff --git a/collectors/etc/metric_naming.py b/collectors/etc/metric_naming.py index f4fd2bd7..dfe033ba 100644 --- a/collectors/etc/metric_naming.py +++ b/collectors/etc/metric_naming.py @@ -1,7 +1,7 @@ #!/usr/bin/env python def print_if_apptuit_standard_metric(metric, mapping, timestamp, value, tags=None, tags_str=None): - if metric not in mapping["metrics"].keys(): + if metric not in list(mapping["metrics"].keys()): return new_metric_name = mapping["metrics"][metric]["standard_name"] new_metric_tags_str = "" @@ -24,5 +24,5 @@ def print_if_apptuit_standard_metric(metric, mapping, timestamp, value, tags=Non if tags_str is not None: new_metric_tags_str = new_metric_tags_str.strip() new_metric_tags_str += " " + tags_str.strip() - print ("%s %d %s %s" - % (new_metric_name, timestamp, value, new_metric_tags_str)) \ No newline at end of file + print("%s %d %s %s" + % (new_metric_name, timestamp, value, new_metric_tags_str)) diff --git a/collectors/etc/mysqlconf.py b/collectors/etc/mysqlconf.py index ffb687ca..dfb04401 100644 --- 
a/collectors/etc/mysqlconf.py +++ b/collectors/etc/mysqlconf.py @@ -4,8 +4,10 @@ MYSQL_CONFIG = yaml_conf.load_collector_configuration('mysql.yml')['collector']['config'] + def get_db_hosts(): - return MYSQL_CONFIG["remote_hosts"].keys() + return list(MYSQL_CONFIG["remote_hosts"].keys()) + def get_db_connection_properties(host): return (MYSQL_CONFIG["remote_hosts"][host]["connect_host"], MYSQL_CONFIG["remote_hosts"][host]["connect_port"], @@ -15,4 +17,4 @@ def get_db_connection_properties(host): def get_db_custom_tags(host): if "tags" not in MYSQL_CONFIG["remote_hosts"][host]: return None - return MYSQL_CONFIG["remote_hosts"][host]["tags"] \ No newline at end of file + return MYSQL_CONFIG["remote_hosts"][host]["tags"] diff --git a/collectors/lib/hadoop_http.py b/collectors/lib/hadoop_http.py index ed09c711..681324b1 100644 --- a/collectors/lib/hadoop_http.py +++ b/collectors/lib/hadoop_http.py @@ -12,11 +12,19 @@ # of the GNU Lesser General Public License along with this program. If not, # see . -import httplib +import sys + +is_py2 = sys.version[0] == '2' +if is_py2: + import httplib as httplib +else: + import http.client as httplib + try: import json except ImportError: json = None + try: from collections import OrderedDict # New in Python 2.7 except ImportError: @@ -28,6 +36,7 @@ "name" ) + class HadoopHttp(object): def __init__(self, service, daemon, host, port, uri="/jmx"): self.service = service @@ -59,16 +68,16 @@ def poll(self): for bean in json_arr: if (not bean['name']) or (not "name=" in bean['name']): continue - #split the name string + # split the name string context = bean['name'].split("name=")[1].split(",sub=") # Create a set that keeps the first occurrence - context = OrderedDict.fromkeys(context).keys() + context = list(OrderedDict.fromkeys(context).keys()) # lower case and replace spaces. 
context = [c.lower().replace(" ", "_") for c in context] # don't want to include the service or daemon twice context = [c for c in context if c != self.service and c != self.daemon] - for key, value in bean.iteritems(): + for key, value in bean.items(): if key in EXCLUDED_KEYS: continue if not is_numeric(value): @@ -78,11 +87,12 @@ def poll(self): def emit_metric(self, context, current_time, metric_name, value, tag_dict=None): if not tag_dict: - print "%s.%s.%s.%s %d %d" % (self.service, self.daemon, ".".join(context), metric_name, current_time, value) + print("%s.%s.%s.%s %d %d" % ( + self.service, self.daemon, ".".join(context), metric_name, current_time, value)) else: - tag_string = " ".join([k + "=" + v for k, v in tag_dict.iteritems()]) - print "%s.%s.%s.%s %d %d %s" % \ - (self.service, self.daemon, ".".join(context), metric_name, current_time, value, tag_string) + tag_string = " ".join([k + "=" + v for k, v in tag_dict.items()]) + print("%s.%s.%s.%s %d %d %s" % \ + (self.service, self.daemon, ".".join(context), metric_name, current_time, value, tag_string)) def emit(self): pass diff --git a/collectors/lib/utils.py b/collectors/lib/utils.py index 0259e6b7..0260d6fd 100644 --- a/collectors/lib/utils.py +++ b/collectors/lib/utils.py @@ -14,6 +14,8 @@ """Common utility functions shared for Python collectors""" +from __future__ import print_function + import os import stat import pwd @@ -42,7 +44,8 @@ def is_sockfile(path): """Returns whether or not the given path is a socket file.""" try: s = os.stat(path) - except OSError, (no, e): + except OSError as os_error: + (no, e) = os_error.args if no == errno.ENOENT: return False err("warning: couldn't stat(%r): %s" % (path, e)) @@ -50,9 +53,9 @@ def is_sockfile(path): return s.st_mode & stat.S_IFSOCK == stat.S_IFSOCK -def err(msg): - print >> sys.stderr, msg +def err(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) def is_numeric(value): - return isinstance(value, (int, long, float)) + return isinstance(value, (int, float)) diff --git a/eos/collectors/eos.py b/eos/collectors/eos.py index ea0c7a15..5268a315 100755 --- a/eos/collectors/eos.py +++ b/eos/collectors/eos.py @@ -13,9 +13,9 @@ try: - import eossdk + import eossdk except ImportError: - eossdk = None + eossdk = None import sys import time @@ -23,174 +23,173 @@ class IntfCounterCollector(eossdk.AgentHandler, eossdk.TimeoutHandler): - - intf_types = frozenset([eossdk.INTF_TYPE_ETH, - eossdk.INTF_TYPE_MANAGEMENT, - eossdk.INTF_TYPE_LAG]) - - def __init__(self, agent_mgr, timeout_mgr, intf_mgr, - intf_counter_mgr, eth_phy_intf_counter_mgr): - self.intf_mgr_ = intf_mgr - self.intf_counter_mgr_ = intf_counter_mgr - self.eth_phy_intf_counter_mgr_ = eth_phy_intf_counter_mgr - self.interval_ = 30 - eossdk.AgentHandler.__init__(self, agent_mgr) - eossdk.TimeoutHandler.__init__(self, timeout_mgr) - - def on_initialized(self): - # Schedule ourselves to run immediately - self.timeout_time_is(eossdk.now()) - - def on_timeout(self): - for intf_id in self.intf_mgr_.intf_iter(): - if intf_id.intf_type() in self.intf_types: - self.printIntfCounters(intf_id) - sys.stdout.flush() - self.timeout_time_is(eossdk.now() + self.interval_) - - def printIntfCounters(self, intf_id): - ts = int(time.time()) - - self.intf_counter_mgr_.counters(intf_id) - intf_counters = self.intf_counter_mgr_.counters(intf_id) - counters = [ - ("ucastPkts", {"direction" : "out"}, - intf_counters.out_ucast_pkts()), - ("multicastPkts", {"direction" : "out"}, - intf_counters.out_multicast_pkts()), - ("broadcastPkts", 
{"direction" : "out"}, - intf_counters.out_broadcast_pkts()), - ("ucastPkts", {"direction" : "in"}, - intf_counters.in_ucast_pkts()), - ("multicastPkts", {"direction" : "in"}, - intf_counters.in_multicast_pkts()), - ("broadcastPkts", {"direction" : "in"}, - intf_counters.in_broadcast_pkts()), - ("octets", {"direction" : "out"}, - intf_counters.out_octets()), - ("octets", {"direction" : "in"}, - intf_counters.in_octets()), - ("discards", {"direction" : "out"}, - intf_counters.out_discards()), - ("errors", {"direction" : "out"}, - intf_counters.out_errors()), - ("discards", {"direction" : "in"}, - intf_counters.in_discards()), - ("errors", {"direction" : "in"}, - intf_counters.in_errors()), - ] - for counter, tags, value in counters: - self.printIntfCounter(counter, ts, value, intf_id, tags) - - if intf_id.intf_type() == eossdk.INTF_TYPE_ETH: - eth_intf_counters = self.eth_phy_intf_counter_mgr_.counters(intf_id) - eth_counters = [ - ("singleCollisionFrames", {}, - eth_intf_counters.single_collision_frames()), - ("multipleCollisionFrames", {}, - eth_intf_counters.multiple_collision_frames()), - ("fcsErrors", {}, - eth_intf_counters.fcs_errors()), - ("alignmentErrors", {}, - eth_intf_counters.alignment_errors()), - ("deferredTransmissions", {}, - eth_intf_counters.deferred_transmissions()), - ("lateCollisions", {}, - eth_intf_counters.late_collisions()), - ("excessiveCollisions", {}, - eth_intf_counters.excessive_collisions()), - ("internalMacTransmitErrors", {}, - eth_intf_counters.internal_mac_transmit_errors()), - ("carrierSenseErrors", {}, - eth_intf_counters.carrier_sense_errors()), - ("internalMacReceiveErrors", {}, - eth_intf_counters.internal_mac_receive_errors()), - ("frameTooShorts", {}, - eth_intf_counters.frame_too_shorts()), - ("sqe_testErrors", {}, - eth_intf_counters.sqe_test_errors()), - ("symbolErrors", {}, - eth_intf_counters.symbol_errors()), - ("unknownOpcodes", {"direction" : "in"}, - eth_intf_counters.in_unknown_opcodes()), - ("pauseFrames", {"direction" : "out"}, - eth_intf_counters.out_pause_frames()), - ("pauseFrames", {"direction" : "in"}, - eth_intf_counters.in_pause_frames()), - ("fragments", {}, - eth_intf_counters.fragments()), - ("jabbers", {}, - eth_intf_counters.jabbers()), - ] - for counter, tags, value in eth_counters: + intf_types = frozenset([eossdk.INTF_TYPE_ETH, + eossdk.INTF_TYPE_MANAGEMENT, + eossdk.INTF_TYPE_LAG]) + + def __init__(self, agent_mgr, timeout_mgr, intf_mgr, + intf_counter_mgr, eth_phy_intf_counter_mgr): + self.intf_mgr_ = intf_mgr + self.intf_counter_mgr_ = intf_counter_mgr + self.eth_phy_intf_counter_mgr_ = eth_phy_intf_counter_mgr + self.interval_ = 30 + eossdk.AgentHandler.__init__(self, agent_mgr) + eossdk.TimeoutHandler.__init__(self, timeout_mgr) + + def on_initialized(self): + # Schedule ourselves to run immediately + self.timeout_time_is(eossdk.now()) + + def on_timeout(self): + for intf_id in self.intf_mgr_.intf_iter(): + if intf_id.intf_type() in self.intf_types: + self.printIntfCounters(intf_id) + sys.stdout.flush() + self.timeout_time_is(eossdk.now() + self.interval_) + + def printIntfCounters(self, intf_id): + ts = int(time.time()) + + self.intf_counter_mgr_.counters(intf_id) + intf_counters = self.intf_counter_mgr_.counters(intf_id) + counters = [ + ("ucastPkts", {"direction": "out"}, + intf_counters.out_ucast_pkts()), + ("multicastPkts", {"direction": "out"}, + intf_counters.out_multicast_pkts()), + ("broadcastPkts", {"direction": "out"}, + intf_counters.out_broadcast_pkts()), + ("ucastPkts", {"direction": "in"}, + 
intf_counters.in_ucast_pkts()), + ("multicastPkts", {"direction": "in"}, + intf_counters.in_multicast_pkts()), + ("broadcastPkts", {"direction": "in"}, + intf_counters.in_broadcast_pkts()), + ("octets", {"direction": "out"}, + intf_counters.out_octets()), + ("octets", {"direction": "in"}, + intf_counters.in_octets()), + ("discards", {"direction": "out"}, + intf_counters.out_discards()), + ("errors", {"direction": "out"}, + intf_counters.out_errors()), + ("discards", {"direction": "in"}, + intf_counters.in_discards()), + ("errors", {"direction": "in"}, + intf_counters.in_errors()), + ] + for counter, tags, value in counters: self.printIntfCounter(counter, ts, value, intf_id, tags) - eth_intf_bin_counters = self.eth_phy_intf_counter_mgr_.bin_counters(intf_id) - eth_bin_counters = [ - ("frameBySize", {"size" : "64", "direction" : "in"}, - eth_intf_bin_counters.in_64_octet_frames()), - ("frameBySize", {"size" : "65To127", "direction" : "in"}, - eth_intf_bin_counters.in_65_to_127_octet_frames()), - ("frameBySize", {"size" : "128To255", "direction" : "in"}, - eth_intf_bin_counters.in_128_to_255_octet_frames()), - ("frameBySize", {"size" : "256To511", "direction" : "in"}, - eth_intf_bin_counters.in_256_to_511_octet_frames()), - ("frameBySize", {"size" : "512To1023", "direction" : "in"}, - eth_intf_bin_counters.in_512_to_1023_octet_frames()), - ("frameBySize", {"size" : "1024To1522", "direction" : "in"}, - eth_intf_bin_counters.in_1024_to_1522_octet_frames()), - ("frameBySize", {"size" : "1523ToMax", "direction" : "in"}, - eth_intf_bin_counters.in_1523_to_max_octet_frames()), - ("frameBySize", {"size" : "64", "direction" : "out"}, - eth_intf_bin_counters.out_64_octet_frames()), - ("frameBySize", {"size" : "65To127", "direction" : "out"}, - eth_intf_bin_counters.out_65_to_127_octet_frames()), - ("frameBySize", {"size" : "128To255", "direction" : "out"}, - eth_intf_bin_counters.out_128_to_255_octet_frames()), - ("frameBySize", {"size" : "256To511", "direction" : "out"}, - eth_intf_bin_counters.out_256_to_511_octet_frames()), - ("frameBySize", {"size" : "512To1023", "direction" : "out"}, - eth_intf_bin_counters.out_512_to_1023_octet_frames()), - ("frameBySize", {"size" : "1024To1522", "direction" : "out"}, - eth_intf_bin_counters.out_1024_to_1522_octet_frames()), - ("frameBySize", {"size" : "1523ToMax", "direction" : "out"}, - eth_intf_bin_counters.out_1523_to_max_octet_frames()), + if intf_id.intf_type() == eossdk.INTF_TYPE_ETH: + eth_intf_counters = self.eth_phy_intf_counter_mgr_.counters(intf_id) + eth_counters = [ + ("singleCollisionFrames", {}, + eth_intf_counters.single_collision_frames()), + ("multipleCollisionFrames", {}, + eth_intf_counters.multiple_collision_frames()), + ("fcsErrors", {}, + eth_intf_counters.fcs_errors()), + ("alignmentErrors", {}, + eth_intf_counters.alignment_errors()), + ("deferredTransmissions", {}, + eth_intf_counters.deferred_transmissions()), + ("lateCollisions", {}, + eth_intf_counters.late_collisions()), + ("excessiveCollisions", {}, + eth_intf_counters.excessive_collisions()), + ("internalMacTransmitErrors", {}, + eth_intf_counters.internal_mac_transmit_errors()), + ("carrierSenseErrors", {}, + eth_intf_counters.carrier_sense_errors()), + ("internalMacReceiveErrors", {}, + eth_intf_counters.internal_mac_receive_errors()), + ("frameTooShorts", {}, + eth_intf_counters.frame_too_shorts()), + ("sqe_testErrors", {}, + eth_intf_counters.sqe_test_errors()), + ("symbolErrors", {}, + eth_intf_counters.symbol_errors()), + ("unknownOpcodes", {"direction": "in"}, + 
eth_intf_counters.in_unknown_opcodes()), + ("pauseFrames", {"direction": "out"}, + eth_intf_counters.out_pause_frames()), + ("pauseFrames", {"direction": "in"}, + eth_intf_counters.in_pause_frames()), + ("fragments", {}, + eth_intf_counters.fragments()), + ("jabbers", {}, + eth_intf_counters.jabbers()), ] - for counter, tags, value in eth_bin_counters: - self.printIntfCounter(counter, ts, value, intf_id, tags) + for counter, tags, value in eth_counters: + self.printIntfCounter(counter, ts, value, intf_id, tags) + + eth_intf_bin_counters = self.eth_phy_intf_counter_mgr_.bin_counters(intf_id) + eth_bin_counters = [ + ("frameBySize", {"size": "64", "direction": "in"}, + eth_intf_bin_counters.in_64_octet_frames()), + ("frameBySize", {"size": "65To127", "direction": "in"}, + eth_intf_bin_counters.in_65_to_127_octet_frames()), + ("frameBySize", {"size": "128To255", "direction": "in"}, + eth_intf_bin_counters.in_128_to_255_octet_frames()), + ("frameBySize", {"size": "256To511", "direction": "in"}, + eth_intf_bin_counters.in_256_to_511_octet_frames()), + ("frameBySize", {"size": "512To1023", "direction": "in"}, + eth_intf_bin_counters.in_512_to_1023_octet_frames()), + ("frameBySize", {"size": "1024To1522", "direction": "in"}, + eth_intf_bin_counters.in_1024_to_1522_octet_frames()), + ("frameBySize", {"size": "1523ToMax", "direction": "in"}, + eth_intf_bin_counters.in_1523_to_max_octet_frames()), + ("frameBySize", {"size": "64", "direction": "out"}, + eth_intf_bin_counters.out_64_octet_frames()), + ("frameBySize", {"size": "65To127", "direction": "out"}, + eth_intf_bin_counters.out_65_to_127_octet_frames()), + ("frameBySize", {"size": "128To255", "direction": "out"}, + eth_intf_bin_counters.out_128_to_255_octet_frames()), + ("frameBySize", {"size": "256To511", "direction": "out"}, + eth_intf_bin_counters.out_256_to_511_octet_frames()), + ("frameBySize", {"size": "512To1023", "direction": "out"}, + eth_intf_bin_counters.out_512_to_1023_octet_frames()), + ("frameBySize", {"size": "1024To1522", "direction": "out"}, + eth_intf_bin_counters.out_1024_to_1522_octet_frames()), + ("frameBySize", {"size": "1523ToMax", "direction": "out"}, + eth_intf_bin_counters.out_1523_to_max_octet_frames()), + ] + for counter, tags, value in eth_bin_counters: + self.printIntfCounter(counter, ts, value, intf_id, tags) - def printIntfCounter(self, counter, ts, value, intf_id, tags): - tag_str = " ".join(["%s=%s" % (tag_name, tag_value) for - (tag_name, tag_value) in tags.items()]) - sys.stdout.write("eos.interface.%s %d %d iface=%s %s\n" - % (counter, ts, value, intf_id.to_string(), tag_str)) + def printIntfCounter(self, counter, ts, value, intf_id, tags): + tag_str = " ".join(["%s=%s" % (tag_name, tag_value) for + (tag_name, tag_value) in tags.items()]) + sys.stdout.write("eos.interface.%s %d %d iface=%s %s\n" + % (counter, ts, value, intf_id.to_string(), tag_str)) def main(): - if eossdk == None: - # This collector requires the eossdk module - return 13 # Ask tcollector to not respawn us + if eossdk == None: + # This collector requires the eossdk module + return 13 # Ask tcollector to not respawn us - sdk = eossdk.Sdk("tcollector-eos") + sdk = eossdk.Sdk("tcollector-eos") - # Create the state managers we're going to poll. 
For now, - # we're just pulling information on interface counters - agent_mgr = sdk.get_agent_mgr() - intf_mgr = sdk.get_intf_mgr() - intf_counter_mgr = sdk.get_intf_counter_mgr() - eth_phy_intf_counter_mgr = sdk.get_eth_phy_intf_counter_mgr() - timeout_mgr = sdk.get_timeout_mgr() + # Create the state managers we're going to poll. For now, + # we're just pulling information on interface counters + agent_mgr = sdk.get_agent_mgr() + intf_mgr = sdk.get_intf_mgr() + intf_counter_mgr = sdk.get_intf_counter_mgr() + eth_phy_intf_counter_mgr = sdk.get_eth_phy_intf_counter_mgr() + timeout_mgr = sdk.get_timeout_mgr() - # Create a periodic interface counter collector - _ = IntfCounterCollector(agent_mgr, - timeout_mgr, - intf_mgr, - intf_counter_mgr, - eth_phy_intf_counter_mgr) + # Create a periodic interface counter collector + _ = IntfCounterCollector(agent_mgr, + timeout_mgr, + intf_mgr, + intf_counter_mgr, + eth_phy_intf_counter_mgr) - # Start the main loop - sdk.main_loop(sys.argv) + # Start the main loop + sdk.main_loop(sys.argv) if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/eos/tcollector_agent.py b/eos/tcollector_agent.py index 9ed31760..0eb3355e 100644 --- a/eos/tcollector_agent.py +++ b/eos/tcollector_agent.py @@ -30,294 +30,296 @@ class SdkLogger(object): - """Pretends to be a logging.Logger but logs using EOS SDK. - - Note that this only implements a subset of the logging.Logger API. - """ - # We do format string expansion in Python to work around BUG116830. - - def __init__(self, name): - self.tracer = eossdk.Tracer(name) - - def debug(self, msg, *args): - if self.tracer.enabled(eossdk.Level8): - self.tracer.trace8("DEBUG: " + msg % args) - - def info(self, msg, *args): - if self.tracer.enabled(eossdk.Level5): - self.tracer.trace5("INFO: " + msg % args) - - def warning(self, msg, *args): - if self.tracer.enabled(eossdk.Level2): - self.tracer.trace2("WARNING: " + msg % args) - - def error(self, msg, *args): - if self.tracer.enabled(eossdk.Level1): - self.tracer.trace1("ERROR: " + msg % args) - - def exception(self, msg, *args): - if self.tracer.enabled(eossdk.Level1): - self.tracer.trace1("ERROR: " + msg % args + traceback.format_exc()) - - def fatal(self, msg, *args): - self.tracer.enabled_is(eossdk.Level1, True) - msg %= args - self.tracer.trace1(msg) - assert False, msg - - def setLevel(self, level): - self.tracer.enabled_is(eossdk.Level8, level <= logging.DEBUG) - self.tracer.enabled_is(eossdk.Level5, level <= logging.INFO) - self.tracer.enabled_is(eossdk.Level2, level <= logging.WARNING) - self.tracer.enabled_is(eossdk.Level1, level <= logging.ERROR) - - @property - def level(self): - # TODO: There's currently no API to ask the Tracer what level(s) are enabled. - # tcollector currently only cares about whether or not debug logging is on. - if self.tracer.enabled(eossdk.Level8): - return logging.DEBUG - elif self.tracer.enabled(eossdk.Level5): - return logging.INFO - elif self.tracer.enabled(eossdk.Level2): - return logging.WARNING - elif self.tracer.enabled(eossdk.Level1): - return logging.ERROR - return logging.CRITICAL - - def addHandler(self, unused_handler): - pass - - def removeHandler(self, unused_handler): - pass + """Pretends to be a logging.Logger but logs using EOS SDK. + + Note that this only implements a subset of the logging.Logger API. + """ + + # We do format string expansion in Python to work around BUG116830. 
+ + def __init__(self, name): + self.tracer = eossdk.Tracer(name) + + def debug(self, msg, *args): + if self.tracer.enabled(eossdk.Level8): + self.tracer.trace8("DEBUG: " + msg % args) + + def info(self, msg, *args): + if self.tracer.enabled(eossdk.Level5): + self.tracer.trace5("INFO: " + msg % args) + + def warning(self, msg, *args): + if self.tracer.enabled(eossdk.Level2): + self.tracer.trace2("WARNING: " + msg % args) + + def error(self, msg, *args): + if self.tracer.enabled(eossdk.Level1): + self.tracer.trace1("ERROR: " + msg % args) + + def exception(self, msg, *args): + if self.tracer.enabled(eossdk.Level1): + self.tracer.trace1("ERROR: " + msg % args + traceback.format_exc()) + + def fatal(self, msg, *args): + self.tracer.enabled_is(eossdk.Level1, True) + msg %= args + self.tracer.trace1(msg) + assert False, msg + + def setLevel(self, level): + self.tracer.enabled_is(eossdk.Level8, level <= logging.DEBUG) + self.tracer.enabled_is(eossdk.Level5, level <= logging.INFO) + self.tracer.enabled_is(eossdk.Level2, level <= logging.WARNING) + self.tracer.enabled_is(eossdk.Level1, level <= logging.ERROR) + + @property + def level(self): + # TODO: There's currently no API to ask the Tracer what level(s) are enabled. + # tcollector currently only cares about whether or not debug logging is on. + if self.tracer.enabled(eossdk.Level8): + return logging.DEBUG + elif self.tracer.enabled(eossdk.Level5): + return logging.INFO + elif self.tracer.enabled(eossdk.Level2): + return logging.WARNING + elif self.tracer.enabled(eossdk.Level1): + return logging.ERROR + return logging.CRITICAL + + def addHandler(self, unused_handler): + pass + + def removeHandler(self, unused_handler): + pass class TcollectorAgent(eossdk.AgentHandler, eossdk.SystemHandler, eossdk.TimeoutHandler): - def __init__(self, sdk): - eossdk.AgentHandler.__init__(self, sdk.get_agent_mgr()) - eossdk.TimeoutHandler.__init__(self, sdk.get_timeout_mgr()) - eossdk.SystemHandler.__init__(self, sdk.get_system_mgr()) - self.vrf_mgr_ = sdk.get_vrf_mgr() - - # Agent local status - self.tcollector_running_ = False - self.shutdown_in_progress_ = False - - self.reader_thread_ = None - self.sender_thread_ = None - self.main_thread_ = None - self.module_ = None - self.tags_ = None - debug("TcollectorAgent created") - - def on_initialized(self): - level = self.get_agent_mgr().agent_option("trace") - if level: - self._set_trace(level) - debug("Agent initialized.") - - # Set up initial status - self.get_agent_mgr().status_set("has_tcollector_py", "False") - - self.tags_ = { "host": self._get_hostname() } - # TODO add additional tags - - # Go through the agent startup process. - self.on_agent_enabled(self.get_agent_mgr().enabled()) - - def on_agent_enabled(self, enabled): - self._maybe_connect() - - def on_agent_option(self, name, value): - if name == "trace": - return self._set_trace(value) - # Options have changed. Attempt to (re)connect. - self._maybe_connect() - - def _set_trace(self, level): - level = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warn": logging.WARNING, - "warning": logging.WARNING, - "error": logging.ERROR, - }.get(level.lower()) - if not level: - level = logging.INFO - self._import_tcollector() - self.module_.LOG.setLevel(level) - - def on_timeout(self): - """ Called when we've tried to shutdown the tcollector process - and need to wait for it to finish. Since we can't get notified - asynchronously, this is done out of a timer callback. """ - if self.shutdown_in_progress_: - # Not yet complete, check again in a second. 
- self.timeout_time_is(eossdk.now() + 1) - else: - # tcollector shutdown complete. Check to make sure - # we weren't re-enabled while shutting down. - self._maybe_connect() - - def _maybe_connect(self): - self._import_tcollector() - - if self.shutdown_in_progress_: - debug("tcollector is shutting down, will retry once complete") - return - - if not self._should_start(): - if self.tcollector_running_: - # First we have to stop the current tcollector - self.stop() - else: - debug("tcollector already stopped") - if not self.get_agent_mgr().enabled(): - # Agent has been disabled and tcollector is stopped. - # Declare cleanup complete - self.get_agent_mgr().agent_shutdown_complete_is(True) - else: - if not self.tcollector_running_: - self.start() - else: - debug("tcollector already running") - - def _should_start(self): - return (self.module_ is not None - and self.get_agent_mgr().enabled() - and self._get_tsd_host()) - - def _import_tcollector(self): - if self.module_ is not None: - return - try: - self.module_ = imp.load_source("tcollector", - TCOLLECTOR_PATH) - debug("Found tcollector.py") - self.get_agent_mgr().status_set("has_tcollector_py", "True") - self.module_.LOG = SdkLogger("tcollector") - self.module_.setup_logging() - except IOError, e: - import errno - if e.errno != errno.ENOENT: - raise - debug("No such file: tcollector.py") - - def _get_hostname(self): - hostname = self.get_system_mgr().hostname() - if not hostname or (hostname == "localhost"): - hostname = socket.gethostname() - return hostname - - def _get_tsd_host(self): - return self.get_agent_mgr().agent_option("tsd-host") - - def _get_tsd_port(self): - tsdPort = self.get_agent_mgr().agent_option("tsd-port") - if tsdPort and tsdPort.isdigit(): - return int(tsdPort) - else: - return DEFAULT_TSD_PORT - - def _socket_at(self, family, socktype, proto): - vrf = self.get_agent_mgr().agent_option("vrf") or "" - fd = self.vrf_mgr_.socket_at(family, socktype, proto, vrf) - return socket._socketobject(_sock=socket.fromfd(fd, family, socktype, proto)) - - def start(self): - tcollector = self.module_ - tcollector.ALIVE = True - args = [TCOLLECTOR_PATH, - "--host", self._get_tsd_host(), - "--port", str(self._get_tsd_port()), - "--collector-dir=/usr/local/tcollector/collectors"] - - if self.get_agent_mgr().agent_option("dedup-interval"): - args.append("--dedup-interval=%s" % - self.get_agent_mgr().agent_option("dedup-interval")) - - tcollector.socket.socket = self._socket_at - debug("Starting tcollector", args) - options, args = tcollector.parse_cmdline(args) - tcollector.setup_python_path(TCOLLECTOR_PATH) - self.tags_["host"] = self._get_hostname() - modules = tcollector.load_etc_dir(options, self.tags_) - - reader = tcollector.ReaderThread(options.dedupinterval, - options.evictinterval) - self.reader_thread_ = reader - reader.start() - debug("ReaderThread startup complete") - - # and setup the sender to start writing out to the tsd - hosts = [(options.host, options.port)] - reconnect_interval = 0 - kwargs = {} - if self.get_agent_mgr().agent_option("transport") == "http": - kwargs["http"] = True - elif self.get_agent_mgr().agent_option("transport") == "https": - kwargs["http"] = True - kwargs["ssl"] = True - if self.get_agent_mgr().agent_option("username"): - kwargs["http_username"] = self.get_agent_mgr().agent_option("username") - if self.get_agent_mgr().agent_option("password"): - kwargs["http_password"] = self.get_agent_mgr().agent_option("password") - sender = tcollector.SenderThread(reader, - options.dryrun, - hosts, - not 
options.no_tcollector_stats, - self.tags_, - reconnect_interval, - **kwargs) - self.sender_thread_ = sender - sender.start() - debug("SenderThread startup complete") - - self.main_thread_ = threading.Thread(target=self.module_.main_loop, - name="tcollector", - args=(options, modules, - sender, self.tags_)) - self.main_thread_.start() - debug("tcollector startup complete") - self.tcollector_running_ = True - - def stop(self): - assert not self.shutdown_in_progress_ - self.shutdown_in_progress_ = True - - debug("Telling tcollector to die") - self.module_.ALIVE = False - - def do_stop(): - debug("Joining main thread") - self.main_thread_.join() - debug("Joining ReaderThread thread") - self.reader_thread_.join() - debug("Joining SenderThread thread") - self.sender_thread_.join() - debug("Killing all remaining collectors") - for col in list(self.module_.all_living_collectors()): - col.shutdown() - # Unregister the collectors... - self.module_.COLLECTORS.clear() - debug("Shutdown complete, updating running status") - self.tcollector_running_ = False - # Notify that shutdown is complete - self.shutdown_in_progress_ = False - - # AFAIK we can't join the threads asynchronously, and each thread may - # take several seconds to join, join the threads with another thread... - # Kind of a kludge really. - threading.Thread(target=do_stop, name="stopTcollector").start() - - # Setup timeout handler to poll for stopTcollector thread completion - self.timeout_time_is(eossdk.now() + 1) + def __init__(self, sdk): + eossdk.AgentHandler.__init__(self, sdk.get_agent_mgr()) + eossdk.TimeoutHandler.__init__(self, sdk.get_timeout_mgr()) + eossdk.SystemHandler.__init__(self, sdk.get_system_mgr()) + self.vrf_mgr_ = sdk.get_vrf_mgr() + + # Agent local status + self.tcollector_running_ = False + self.shutdown_in_progress_ = False + + self.reader_thread_ = None + self.sender_thread_ = None + self.main_thread_ = None + self.module_ = None + self.tags_ = None + debug("TcollectorAgent created") + + def on_initialized(self): + level = self.get_agent_mgr().agent_option("trace") + if level: + self._set_trace(level) + debug("Agent initialized.") + + # Set up initial status + self.get_agent_mgr().status_set("has_tcollector_py", "False") + + self.tags_ = {"host": self._get_hostname()} + # TODO add additional tags + + # Go through the agent startup process. + self.on_agent_enabled(self.get_agent_mgr().enabled()) + + def on_agent_enabled(self, enabled): + self._maybe_connect() + + def on_agent_option(self, name, value): + if name == "trace": + return self._set_trace(value) + # Options have changed. Attempt to (re)connect. + self._maybe_connect() + + def _set_trace(self, level): + level = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warn": logging.WARNING, + "warning": logging.WARNING, + "error": logging.ERROR, + }.get(level.lower()) + if not level: + level = logging.INFO + self._import_tcollector() + self.module_.LOG.setLevel(level) + + def on_timeout(self): + """ Called when we've tried to shutdown the tcollector process + and need to wait for it to finish. Since we can't get notified + asynchronously, this is done out of a timer callback. """ + if self.shutdown_in_progress_: + # Not yet complete, check again in a second. + self.timeout_time_is(eossdk.now() + 1) + else: + # tcollector shutdown complete. Check to make sure + # we weren't re-enabled while shutting down. 
+ self._maybe_connect() + + def _maybe_connect(self): + self._import_tcollector() + + if self.shutdown_in_progress_: + debug("tcollector is shutting down, will retry once complete") + return + + if not self._should_start(): + if self.tcollector_running_: + # First we have to stop the current tcollector + self.stop() + else: + debug("tcollector already stopped") + if not self.get_agent_mgr().enabled(): + # Agent has been disabled and tcollector is stopped. + # Declare cleanup complete + self.get_agent_mgr().agent_shutdown_complete_is(True) + else: + if not self.tcollector_running_: + self.start() + else: + debug("tcollector already running") + + def _should_start(self): + return (self.module_ is not None + and self.get_agent_mgr().enabled() + and self._get_tsd_host()) + + def _import_tcollector(self): + if self.module_ is not None: + return + try: + self.module_ = imp.load_source("tcollector", + TCOLLECTOR_PATH) + debug("Found tcollector.py") + self.get_agent_mgr().status_set("has_tcollector_py", "True") + self.module_.LOG = SdkLogger("tcollector") + self.module_.setup_logging() + except IOError as e: + import errno + if e.errno != errno.ENOENT: + raise + debug("No such file: tcollector.py") + + def _get_hostname(self): + hostname = self.get_system_mgr().hostname() + if not hostname or (hostname == "localhost"): + hostname = socket.gethostname() + return hostname + + def _get_tsd_host(self): + return self.get_agent_mgr().agent_option("tsd-host") + + def _get_tsd_port(self): + tsdPort = self.get_agent_mgr().agent_option("tsd-port") + if tsdPort and tsdPort.isdigit(): + return int(tsdPort) + else: + return DEFAULT_TSD_PORT + + def _socket_at(self, family, socktype, proto): + vrf = self.get_agent_mgr().agent_option("vrf") or "" + fd = self.vrf_mgr_.socket_at(family, socktype, proto, vrf) + return socket._socketobject(_sock=socket.fromfd(fd, family, socktype, proto)) + + def start(self): + tcollector = self.module_ + tcollector.ALIVE = True + args = [TCOLLECTOR_PATH, + "--host", self._get_tsd_host(), + "--port", str(self._get_tsd_port()), + "--collector-dir=/usr/local/tcollector/collectors"] + + if self.get_agent_mgr().agent_option("dedup-interval"): + args.append("--dedup-interval=%s" % + self.get_agent_mgr().agent_option("dedup-interval")) + + tcollector.socket.socket = self._socket_at + debug("Starting tcollector", args) + options, args = tcollector.parse_cmdline(args) + tcollector.setup_python_path(TCOLLECTOR_PATH) + self.tags_["host"] = self._get_hostname() + modules = tcollector.load_etc_dir(options, self.tags_) + + reader = tcollector.ReaderThread(options.dedupinterval, + options.evictinterval) + self.reader_thread_ = reader + reader.start() + debug("ReaderThread startup complete") + + # and setup the sender to start writing out to the tsd + hosts = [(options.host, options.port)] + reconnect_interval = 0 + kwargs = {} + if self.get_agent_mgr().agent_option("transport") == "http": + kwargs["http"] = True + elif self.get_agent_mgr().agent_option("transport") == "https": + kwargs["http"] = True + kwargs["ssl"] = True + if self.get_agent_mgr().agent_option("username"): + kwargs["http_username"] = self.get_agent_mgr().agent_option("username") + if self.get_agent_mgr().agent_option("password"): + kwargs["http_password"] = self.get_agent_mgr().agent_option("password") + sender = tcollector.SenderThread(reader, + options.dryrun, + hosts, + not options.no_tcollector_stats, + self.tags_, + reconnect_interval, + **kwargs) + self.sender_thread_ = sender + sender.start() + debug("SenderThread startup 
complete") + + self.main_thread_ = threading.Thread(target=self.module_.main_loop, + name="tcollector", + args=(options, modules, + sender, self.tags_)) + self.main_thread_.start() + debug("tcollector startup complete") + self.tcollector_running_ = True + + def stop(self): + assert not self.shutdown_in_progress_ + self.shutdown_in_progress_ = True + + debug("Telling tcollector to die") + self.module_.ALIVE = False + + def do_stop(): + debug("Joining main thread") + self.main_thread_.join() + debug("Joining ReaderThread thread") + self.reader_thread_.join() + debug("Joining SenderThread thread") + self.sender_thread_.join() + debug("Killing all remaining collectors") + for col in list(self.module_.all_living_collectors()): + col.shutdown() + # Unregister the collectors... + self.module_.COLLECTORS.clear() + debug("Shutdown complete, updating running status") + self.tcollector_running_ = False + # Notify that shutdown is complete + self.shutdown_in_progress_ = False + + # AFAIK we can't join the threads asynchronously, and each thread may + # take several seconds to join, join the threads with another thread... + # Kind of a kludge really. + threading.Thread(target=do_stop, name="stopTcollector").start() + + # Setup timeout handler to poll for stopTcollector thread completion + self.timeout_time_is(eossdk.now() + 1) + def main(): - sdk = eossdk.Sdk() - _ = TcollectorAgent(sdk) - debug("Starting agent") - sdk.main_loop(sys.argv) + sdk = eossdk.Sdk() + _ = TcollectorAgent(sdk) + debug("Starting agent") + sdk.main_loop(sys.argv) diff --git a/grok_scraper.py b/grok_scraper.py index eec961f5..05924f30 100755 --- a/grok_scraper.py +++ b/grok_scraper.py @@ -14,7 +14,7 @@ import yaml import traceback -from StringIO import StringIO +from io import StringIO from collectors.etc import grok_scraper_conf @@ -241,6 +241,7 @@ def munge_metric_name(metric): new_name = re.sub(r'\0', "_", new_name) return new_name + def main(): signal.signal(signal.SIGTERM, die) exporter_dir = grok_scraper_conf.get_grok_exporter_dir() @@ -272,7 +273,7 @@ def format_metric_value(value): def print_metric(metric_name, timestamp, value, tags): print("%s %s %s %s" % (munge_metric_name(metric_name), timestamp, format_metric_value(value), tags)) - for (url, patterns) in urls_vs_patterns.iteritems(): + for (url, patterns) in urls_vs_patterns.items(): try: response = requests.get(url) timestamp = int(time.time()) diff --git a/pylint-runner.py b/pylint-runner.py index 0c1e2add..8358186d 100755 --- a/pylint-runner.py +++ b/pylint-runner.py @@ -16,17 +16,22 @@ BASE_DIRECTORY = os.getcwd() SUMMARY = False + class WritableObject(object): "dummy output stream for pylint" + def __init__(self): self.content = [] + def write(self, st): "dummy write" self.content.append(st) + def read(self): "dummy read" return self.content + def run_pylint(filename, options): "run pylint on the given file" ARGS = ["--rcfile=./.pylintrc"] # put your own here @@ -35,20 +40,22 @@ def run_pylint(filename, options): pylint_output = WritableObject() from pylint import lint from pylint.reporters.text import TextReporter - lint.Run([filename]+ARGS, reporter=TextReporter(pylint_output), exit=False) + lint.Run([filename] + ARGS, reporter=TextReporter(pylint_output), exit=False) return pylint_output.read() + def print_line(line): global VERBOSE if VERBOSE: print(line.rstrip()) + def check(module, options): ''' apply pylint to the file specified if it is a *.py file ''' global total, count, errors - + if module[-3:] == ".py": args = '' @@ -59,22 +66,23 @@ def 
check(module, options): if re.match("E\:.*", line): errors += 1 if options.summary or options.verbose: - print "Module: %s - %s" % (module, line.rstrip()) + print("Module: %s - %s" % (module, line.rstrip())) if re.match("[RCWF]\:.*", line) and options.show_all: print_line(line) - if re.match("E....:.", line): + if re.match("E....:.", line): print_line(line) if "Your code has been rated at" in line: print_line(line) score = re.findall("\d.\d\d", line)[0] total += float(score) + def parse_cmdline(argv): """Parses the command-line.""" global BASE_DIRECTORY, VERBOSE, SUMMARY DEFAULT_BASE_DIR = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), - 'collectors') + 'collectors') parser = OptionParser(description='Runs pylint recursively on a directory') @@ -97,22 +105,24 @@ def parse_cmdline(argv): BASE_DIRECTORY = options.base_directory return (options, args) + def check_version(): - ver = sys.version_info - if ver[0] == 2 and ver[1] < 7: - sys.stderr.write("Requires Python >2.7 for pylint\n") - return False - return True + ver = sys.version_info + if ver[0] == 2 and ver[1] < 7: + sys.stderr.write("Requires Python >2.7 for pylint\n") + return False + return True + def main(argv): global BASE_DIRECTORY, VERBOSE if not check_version(): - return 0 + return 0 options, args = parse_cmdline(argv) - print_line("looking for *.py scripts in subdirectories of %s" % (BASE_DIRECTORY)) + print_line("looking for *.py scripts in subdirectories of %s" % (BASE_DIRECTORY)) for root, dirs, files in os.walk(BASE_DIRECTORY): for name in files: @@ -120,12 +130,13 @@ def main(argv): check(filepath, options) if options.summary: - print "==" * 50 - print "%d modules found" % count - print "%d errors found" % errors + print("==" * 50) + print("%d modules found" % count) + print("%d errors found" % errors) if options.show_all and count > 0: - print "AVERAGE SCORE = %.02f" % (total / count) + print("AVERAGE SCORE = %.02f" % (total / count)) return errors + if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/tcollector.py b/tcollector.py index c306a75a..be469b4f 100755 --- a/tcollector.py +++ b/tcollector.py @@ -34,19 +34,25 @@ import threading import time import json -import urllib2 import base64 import zlib -from httplib import HTTPException + +try: + from urllib.request import urlopen, Request + from urllib.error import HTTPError + from http.client import HTTPException + from queue import Queue, Empty, Full +except ImportError: + from urllib2 import urlopen, Request + from urllib2 import HTTPError + from httplib import HTTPException + from Queue import Queue, Empty, Full + from logging.handlers import RotatingFileHandler -from Queue import Queue -from Queue import Empty -from Queue import Full from optparse import OptionParser, SUPPRESS_HELP from collectors.etc import yaml_conf - # global variables. COLLECTORS = {} GENERATION = 0 @@ -147,10 +153,11 @@ def read(self): if out: LOG.debug('reading %s got %d bytes on stderr', self.name, len(out)) + out = out.decode("utf-8") for line in out.splitlines(): LOG.warning('%s: %s', self.name, line) - except IOError, (err, msg): - if err != errno.EAGAIN: + except IOError as io_err: + if io_err.errno != errno.EAGAIN: raise except: LOG.exception('uncaught exception in stderr read') @@ -159,12 +166,16 @@ def read(self): # out a bunch of data points at one time and we get some weird sized # chunk. This read call is non-blocking. 
try: - self.buffer += self.proc.stdout.read() + data = self.proc.stdout.read() + if self.buffer is None: + self.buffer = data + elif data is not None: + self.buffer += data.decode("utf-8") if len(self.buffer): LOG.debug('reading %s, buffer now %d bytes', self.name, len(self.buffer)) - except IOError, (err, msg): - if err != errno.EAGAIN: + except IOError as io_err: + if io_err.errno != errno.EAGAIN: raise except AttributeError: # sometimes the process goes away in another thread and we don't @@ -226,7 +237,7 @@ def evict_old_keys(self, cut_off): cut_off: A UNIX timestamp. Any value that's older than this will be removed from the cache. """ - for key in self.values.keys(): + for key in list(self.values.keys()): time = self.values[key][3] if time < cut_off: del self.values[key] @@ -479,7 +490,7 @@ def pick_connection(self): # isn't in the blacklist, or until we run out of hosts (i.e. they # are all blacklisted, which typically happens when we lost our # connectivity to the outside world). - for self.current_tsd in xrange(self.current_tsd + 1, len(self.hosts)): + for self.current_tsd in range(self.current_tsd + 1, len(self.hosts)): hostport = self.hosts[self.current_tsd] if hostport not in self.blacklisted_hosts: break @@ -557,7 +568,7 @@ def run(self): if ALIVE: self.send_data() errors = 0 # We managed to do a successful iteration. - except (ArithmeticError, EOFError, EnvironmentError, HTTPException, LookupError, ValueError), e: + except (ArithmeticError, EOFError, EnvironmentError, HTTPException, LookupError, ValueError) as e: errors += 1 if errors > MAX_UNCAUGHT_EXCEPTIONS: LOG.exception('Too many uncaught exceptions in SenderThread (%s), going to exit', errors) @@ -590,7 +601,7 @@ def verify_conn(self): # closing the connection and indicating that we need to reconnect. try: self.tsd.close() - except socket.error, msg: + except socket.error as msg: pass # not handling that self.time_reconnect = time.time() return False @@ -600,7 +611,7 @@ def verify_conn(self): LOG.debug('verifying our TSD connection is alive') try: self.tsd.sendall('version\n') - except socket.error, msg: + except socket.error as msg: self.tsd = None self.blacklist_connection() return False @@ -612,7 +623,7 @@ def verify_conn(self): # connection try: buf = self.tsd.recv(bufsize) - except socket.error, msg: + except socket.error as msg: self.tsd = None self.blacklist_connection() return False @@ -666,9 +677,9 @@ def maintain_conn(self): addresses = socket.getaddrinfo(self.host, self.port, socket.AF_UNSPEC, socket.SOCK_STREAM, 0) - except socket.gaierror, e: + except socket.gaierror as e: # Don't croak on transient DNS resolution issues. - if e[0] in (socket.EAI_AGAIN, socket.EAI_NONAME, + if e.errno in (socket.EAI_AGAIN, socket.EAI_NONAME, socket.EAI_NODATA): LOG.debug('DNS resolution failure: %s: %s', self.host, e) continue @@ -681,7 +692,7 @@ def maintain_conn(self): # if we get here it connected LOG.debug('Connection to %s was successful' % (str(sockaddr))) break - except socket.error, msg: + except socket.error as msg: LOG.warning('Connection attempt failed to %s:%d: %s', self.host, self.port, msg) self.tsd.close() @@ -721,11 +732,11 @@ def send_data(self): # try sending again next time. 
try: if self.dryrun: - print out + print(out) else: self.tsd.sendall(out) self.sendq = [] - except socket.error, msg: + except socket.error as msg: LOG.error('failed to send data: %s', msg) try: self.tsd.close() @@ -766,17 +777,17 @@ def send_data_via_http(self): metric_tags[tag_key] = tag_value metric_entry = {} metric_entry["metric"] = metric - metric_entry["timestamp"] = long(timestamp) + metric_entry["timestamp"] = int(timestamp) metric_entry["value"] = float(value) metric_entry["tags"] = dict(self.tags).copy() if len(metric_tags) + len(metric_entry["tags"]) > self.maxtags: metric_tags_orig = set(metric_tags) subset_metric_keys = frozenset( metric_tags[:len(metric_tags[:self.maxtags - len(metric_entry["tags"])])]) - metric_tags = dict((k, v) for k, v in metric_tags.iteritems() if k in subset_metric_keys) + metric_tags = dict((k, v) for k, v in metric_tags.items() if k in subset_metric_keys) LOG.error("Exceeding maximum permitted metric tags - removing %s for metric %s", str(metric_tags_orig - set(metric_tags)), metric) - if "host" in metric_tags.keys() and metric_tags["host"] == "__..skip_tag..__": + if "host" in list(metric_tags.keys()) and metric_tags["host"] == "__..skip_tag..__": metric_tags.pop("host") if "host" in metric_entry["tags"]: metric_entry["tags"].pop("host", None) @@ -784,32 +795,33 @@ def send_data_via_http(self): metrics.append(metric_entry) if self.dryrun: - print "Would have sent:\n%s" % json.dumps(metrics, + print("Would have sent:\n%s" % json.dumps(metrics, sort_keys=True, - indent=4) + indent=4)) return - if ((self.current_tsd == -1) or (len(self.hosts) > 1)): + if (self.current_tsd == -1) or (len(self.hosts) > 1): self.pick_connection() url = self.build_http_url() LOG.debug("Sending metrics to %s", url) - req = urllib2.Request(url) + req = Request(url) if self.http_username and self.http_password: - req.add_header("Authorization", "Basic %s" - % base64.b64encode("%s:%s" % (self.http_username, self.http_password))) + auth_header = ("%s:%s" % (self.http_username, self.http_password)).encode("utf-8") + req.add_header("Authorization", "Basic %s" % base64.b64encode(auth_header).decode("ascii")) req.add_header("Content-Type", "application/json") body = json.dumps(metrics) - if self.compression_threshold > 0 and len(body) >= self.compression_threshold: + if 0 < self.compression_threshold <= len(body): req.add_header("Content-Encoding", "deflate") - body = zlib.compress(body) + body = zlib.compress(body.encode("utf-8")) try: LOG.debug("Writing body of %d lines / %d bytes" % (len(self.sendq), len(body))) - response = urllib2.urlopen(req, body) - LOG.debug("Received response %s %s", response.getcode(), response.read().rstrip('\n')) + response = urlopen(req, body) + data = response.read().decode("utf-8") + LOG.debug("Received response %s %s", response.getcode(), data.rstrip('\n')) # clear out the sendq self.sendq = [] # print "Got response code: %s" % response.getcode() @@ -817,8 +829,9 @@ def send_data_via_http(self): # for line in response: # print line, # print - except urllib2.HTTPError, e: - LOG.error("Got error %s %s while sending %d lines / %d bytes", e, e.read().rstrip('\n'), + except HTTPError as e: + data = e.read().decode("utf-8") + LOG.error("Got error %s %s while sending %d lines / %d bytes", e, data.rstrip('\n'), len(self.sendq), len(body)) if e.code == 401: LOG.error("Please check if your access_token is correct in /etc/xcollector/xcollector.yml") @@ -856,7 +869,7 @@ def tag_str_list_to_dict(list): def tag_dict_to_str_list(dict): tag_list = [] - for key, 
value in dict.iteritems(): + for key, value in dict.items(): tag_list.append(key.strip() + "=" + value.strip()) return tag_list @@ -968,7 +981,7 @@ def tag_dict_to_str_list(dict): help=SUPPRESS_HELP) # 'Username to use for HTTP Basic Auth when sending the data via HTTP' (options, args) = parser.parse_args(args=argv[1:]) cmdline_dict = tag_str_list_to_dict(options.tags) - for key, value in defaults['tags'].iteritems(): + for key, value in defaults['tags'].items(): cmdline_dict[key] = value options.tags = tag_dict_to_str_list(cmdline_dict) if options.dedupinterval < 0: @@ -989,7 +1002,7 @@ def daemonize(): if os.fork(): os._exit(0) os.chdir("/") - os.umask(022) + os.umask(0o22) os.setsid() os.umask(0) if os.fork(): @@ -1001,8 +1014,8 @@ def daemonize(): os.dup2(stdout.fileno(), 2) stdin.close() stdout.close() - os.umask(022) - for fd in xrange(3, 1024): + os.umask(0o22) + for fd in range(3, 1024): try: os.close(fd) except OSError: # This FD wasn't opened... @@ -1255,7 +1268,7 @@ def reload_changed_config_modules(modules, options, sender, tags): changed = False # Reload any module that has changed. - for path, (module, timestamp) in modules.iteritems(): + for path, (module, timestamp) in modules.items(): if path not in current_paths: # Module was removed. continue mtime = os.path.getmtime(path) @@ -1294,7 +1307,7 @@ def write_pid(pidfile): def all_collectors(): """Generator to return all collectors.""" - return COLLECTORS.itervalues() + return iter(COLLECTORS.values()) # collectors that are not marked dead @@ -1417,7 +1430,7 @@ def spawn_collector(col): stderr=subprocess.PIPE, close_fds=True, preexec_fn=os.setsid) - except OSError, e: + except OSError as e: LOG.error('Failed to spawn collector %s: %s' % (col.filename, e)) return # The following line needs to move below this line because it is used in diff --git a/tests.py b/tests.py index 645e605c..cbaf1076 100755 --- a/tests.py +++ b/tests.py @@ -42,7 +42,7 @@ def check_access_rights(top): pass collectors_path = os.path.dirname(os.path.abspath(__file__)) + \ - "/collectors/0" + "/collectors/0" check_access_rights(collectors_path) @@ -104,6 +104,7 @@ def test_doublePickTwoConnections(self): sender.pick_connection() self.assertEqual(tsd1, (sender.host, sender.port)) + class GrokScraperTests(unittest.TestCase): def test_munge_metric_name(self): @@ -119,7 +120,6 @@ def test_munge_metric_name(self): self.assertEqual(munge_metric_name("__tomcat_request_count"), "_tomcat.request.count") - class UDPCollectorTests(unittest.TestCase): def setUp(self): @@ -134,7 +134,7 @@ def setUp(self): sys.exit = lambda x: None try: - execfile(self.udp_bridge.filename, self.udp_globals) + exec(compile(open(self.udp_bridge.filename).read(), self.udp_bridge.filename, 'exec'), self.udp_globals) finally: sys.exit = self.saved_exit @@ -175,7 +175,7 @@ def test_single_line_no_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_single_line_put(self): @@ -189,7 +189,7 @@ def test_single_line_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_multi_line_no_put(self): @@ -202,7 +202,7 @@ def test_multi_line_no_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), 
expected) self.assertListEqual(stderr, []) def test_multi_line_put(self): @@ -218,7 +218,7 @@ def test_multi_line_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_multi_line_mixed_put(self): @@ -236,7 +236,7 @@ def test_multi_line_mixed_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_multi_line_no_put_cond(self): @@ -248,7 +248,7 @@ def test_multi_line_no_put_cond(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_multi_line_put_cond(self): @@ -263,7 +263,7 @@ def test_multi_line_put_cond(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_multi_empty_line_no_put(self): @@ -277,7 +277,7 @@ def test_multi_empty_line_no_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, ['invalid data\n']) def test_multi_empty_line_put(self): @@ -291,7 +291,7 @@ def test_multi_empty_line_put(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, ['invalid data\n']) def test_multi_empty_line_no_put_cond(self): @@ -303,7 +303,7 @@ def test_multi_empty_line_no_put_cond(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) def test_multi_empty_line_put_cond(self): @@ -319,9 +319,10 @@ def test_multi_empty_line_put_cond(self): stdout = [] self.run_bridge_test(inputLines, stdout, stderr) - self.assertEquals(''.join(stdout), expected) + self.assertEqual(''.join(stdout), expected) self.assertListEqual(stderr, []) + if __name__ == '__main__': cdir = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'collectors') From 8e00f685041041e736170066c23b2154195b3c66 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Thu, 22 Feb 2018 16:52:00 +0530 Subject: [PATCH 2/8] Python 3 compatibility --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index cfcd9fd7..fea16e54 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,8 @@ before_script: - echo $PACKAGE_VERSION script: + - python -c "import sys; print('python version:%s' % sys.version);" + - python -c "import sys; print('python version:%s' % sys.version[0]);" - ./pylint-runner.py -s - ./tests.py - make -C rpm From 99b6455498c10452e98998e947f7caeb900c3e9b Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Thu, 22 Feb 2018 17:08:24 +0530 Subject: [PATCH 3/8] Python 3 compatibility --- tests.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests.py b/tests.py index cbaf1076..df697025 100755 --- a/tests.py +++ b/tests.py @@ -21,6 +21,7 @@ import tcollector from grok_scraper import munge_metric_name + class CollectorsTests(unittest.TestCase): def test_collectorsAccessRights(self): 
@@ -134,7 +135,7 @@ def setUp(self): sys.exit = lambda x: None try: - exec(compile(open(self.udp_bridge.filename).read(), self.udp_bridge.filename, 'exec'), self.udp_globals) + self.exec_script() finally: sys.exit = self.saved_exit @@ -143,6 +144,11 @@ def setUp(self): self.udp_globals['udp_bridge_conf'].enabled = lambda: True self.udp_globals['utils'] = mocks.Utils() + def exec_script(self): + with open(self.udp_bridge.filename) as file: + code = compile(file.read(), self.udp_bridge.filename, 'exec') + exec(code, self.udp_globals) + def run_bridge_test(self, udpInputLines, stdoutLines, stderrLines): mockSocket = self.udp_globals['socket'] = mocks.Socket() mockSocket.state['udp_in'] = list(udpInputLines) From 408d573e4f023d73c73a6f5a56e107a93a7cee12 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Thu, 22 Feb 2018 18:28:40 +0530 Subject: [PATCH 4/8] Python 3 compatibility --- collectors/0/elasticsearch.py | 5 ++--- collectors/0/flume.py | 5 ++--- collectors/0/graphite_bridge.py | 5 ++--- collectors/0/riak.py | 11 +++++++++-- collectors/300/aws_cloudwatch_stats.py | 5 ++--- collectors/lib/hadoop_http.py | 7 ++----- collectors/lib/utils.py | 5 ++--- 7 files changed, 21 insertions(+), 22 deletions(-) diff --git a/collectors/0/elasticsearch.py b/collectors/0/elasticsearch.py index cab8ef7d..bfc75279 100755 --- a/collectors/0/elasticsearch.py +++ b/collectors/0/elasticsearch.py @@ -22,10 +22,9 @@ import time import re -is_py2 = sys.version[0] == '2' -if is_py2: +try: import httplib as httplib -else: +except ImportError: import http.client as httplib try: diff --git a/collectors/0/flume.py b/collectors/0/flume.py index 93ef4e98..20badd0c 100755 --- a/collectors/0/flume.py +++ b/collectors/0/flume.py @@ -33,10 +33,9 @@ import sys import time -is_py2 = sys.version[0] == '2' -if is_py2: +try: import httplib as httplib -else: +except ImportError: import http.client as httplib try: diff --git a/collectors/0/graphite_bridge.py b/collectors/0/graphite_bridge.py index 8b244d46..11b7104b 100755 --- a/collectors/0/graphite_bridge.py +++ b/collectors/0/graphite_bridge.py @@ -16,10 +16,9 @@ import sys import threading -is_py2 = sys.version[0] == '2' -if is_py2: +try: import SocketServer as socketserver -else: +except ImportError: import socketserver as socketserver from collectors.lib import utils diff --git a/collectors/0/riak.py b/collectors/0/riak.py index e31814ca..61d36064 100755 --- a/collectors/0/riak.py +++ b/collectors/0/riak.py @@ -44,7 +44,14 @@ import os import sys import time -import urllib.request +try: + from urllib.request import urlopen + from urllib.error import HTTPError + from http.client import HTTPException +except ImportError: + from urllib2 import urlopen + from urllib2 import HTTPError + from httplib import HTTPException from collectors.lib import utils @@ -92,7 +99,7 @@ def print_stat(metric, value, tags=""): while True: ts = int(time.time()) - req = urllib.request.urlopen("http://localhost:8098/stats") + req = urlopen("http://localhost:8098/stats") if req is not None: obj = json.loads(req.read()) for key in obj: diff --git a/collectors/300/aws_cloudwatch_stats.py b/collectors/300/aws_cloudwatch_stats.py index a4a82bc6..c37611d4 100755 --- a/collectors/300/aws_cloudwatch_stats.py +++ b/collectors/300/aws_cloudwatch_stats.py @@ -8,10 +8,9 @@ import exceptions import threading -is_py2 = sys.version[0] == '2' -if is_py2: +try: import Queue as queue -else: +except ImportError: import queue as queue from time import mktime diff --git a/collectors/lib/hadoop_http.py 
b/collectors/lib/hadoop_http.py index 681324b1..3765ffd2 100644 --- a/collectors/lib/hadoop_http.py +++ b/collectors/lib/hadoop_http.py @@ -12,12 +12,9 @@ # of the GNU Lesser General Public License along with this program. If not, # see . -import sys - -is_py2 = sys.version[0] == '2' -if is_py2: +try: import httplib as httplib -else: +except ImportError: import http.client as httplib try: diff --git a/collectors/lib/utils.py b/collectors/lib/utils.py index 0260d6fd..d1296fb0 100644 --- a/collectors/lib/utils.py +++ b/collectors/lib/utils.py @@ -45,10 +45,9 @@ def is_sockfile(path): try: s = os.stat(path) except OSError as os_error: - (no, e) = os_error.args - if no == errno.ENOENT: + if os_error.errno == errno.ENOENT: return False - err("warning: couldn't stat(%r): %s" % (path, e)) + err("warning: couldn't stat(%r): %s" % (path, os_error.message)) return None return s.st_mode & stat.S_IFSOCK == stat.S_IFSOCK From 7aaa4220feb4f2de87b526ed5423e9352fc9b880 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Thu, 22 Feb 2018 18:50:47 +0530 Subject: [PATCH 5/8] Python 3 compatibility --- collectors/0/mountstats.py | 8 ++++---- collectors/300/aws_cloudwatch_stats.py | 6 +++--- collectors/lib/utils.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/collectors/0/mountstats.py b/collectors/0/mountstats.py index 5a5bfa7f..26899c31 100755 --- a/collectors/0/mountstats.py +++ b/collectors/0/mountstats.py @@ -79,11 +79,9 @@ # proc.mountstats.bytes.writepages 1464196613 2477054 nfshost=fls1.sys.lab1.syseng.tmcs nfsvol=/vol/vol0 """ -import os -import socket import sys import time -import md5 +import hashlib COLLECTION_INTERVAL = 10 # seconds @@ -143,7 +141,9 @@ def main(): # ( If multiple subdirectories of the same volume are mounted to different places they # will show up in mountstats, but will have duplicate data. 
) if field == "events": - m = md5.new(line).digest() + digester = hashlib.md5() + digester.update(line) + m = digester.digest() rpc_metrics[device]['digest'] = m if m in rpc_metrics: # metrics already counted, mark as dupe ignore diff --git a/collectors/300/aws_cloudwatch_stats.py b/collectors/300/aws_cloudwatch_stats.py index c37611d4..d7a2d93f 100755 --- a/collectors/300/aws_cloudwatch_stats.py +++ b/collectors/300/aws_cloudwatch_stats.py @@ -139,7 +139,7 @@ def handle_region(region, statistic): except boto.exception.BotoServerError as e: # sys.stderr.write("finished region " + region + "," + statistic + "\n") pass - except exceptions.KeyboardInterrupt: + except KeyboardInterrupt: return 0 except: sys.stderr.write("failed region " + region + "," + statistic + "\n") @@ -166,7 +166,7 @@ def send_metrics(): for output in outputs: for t in output: print(t) - except exceptions.KeyboardInterrupt: + except KeyboardInterrupt: return 0 @@ -195,7 +195,7 @@ def main(): t.start() while threading.activeCount() > 1: time.sleep(1) - except exceptions.KeyboardInterrupt: + except KeyboardInterrupt: return 0 except: raise diff --git a/collectors/lib/utils.py b/collectors/lib/utils.py index d1296fb0..59a1ed16 100644 --- a/collectors/lib/utils.py +++ b/collectors/lib/utils.py @@ -47,7 +47,7 @@ def is_sockfile(path): except OSError as os_error: if os_error.errno == errno.ENOENT: return False - err("warning: couldn't stat(%r): %s" % (path, os_error.message)) + err("warning: couldn't stat(%r): %s" % (path, os_error.args[1])) return None return s.st_mode & stat.S_IFSOCK == stat.S_IFSOCK From e4b7f615f759609dfea67c95bf8de4bcb29a6ef5 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Fri, 23 Feb 2018 13:39:35 +0530 Subject: [PATCH 6/8] Python 3 compatibility --- collectors/0/procnettcp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collectors/0/procnettcp.py b/collectors/0/procnettcp.py index 42778d14..4d598e4b 100755 --- a/collectors/0/procnettcp.py +++ b/collectors/0/procnettcp.py @@ -222,9 +222,9 @@ def main(unused_args): key = ("state=%s endpoint=%s service=%s user=%s" % (TCPSTATES[state], endpoint, service, user)) if key in counter: - print("proc.net.tcp", ts, counter[key], key) + print("proc.net.tcp %d %s %s" % (ts, counter[key], key)) else: - print("proc.net.tcp", ts, "0", key) + print("proc.net.tcp %d %s %s" % (ts, "0", key)) sys.stdout.flush() time.sleep(interval) From a53c99643281d9ba2afe3eb670ea84ae4c4892f3 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Fri, 23 Feb 2018 18:28:32 +0530 Subject: [PATCH 7/8] Python 3 compatibility --- collectors/0/iostat.py | 2 +- collectors/0/memcache.py | 4 ++-- collectors/0/procnettcp.py | 2 +- collectors/300/aws_cloudwatch_stats.py | 2 +- grok_scraper.py | 3 ++- rpm/xcollector.spec | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/collectors/0/iostat.py b/collectors/0/iostat.py index fbfb4987..8577beb6 100755 --- a/collectors/0/iostat.py +++ b/collectors/0/iostat.py @@ -247,7 +247,7 @@ def main(): print("%s%s %d %s dev=%s" % (metric, FIELDS_PART[i], ts, values[i + 3], device)) else: - utils.err("Cannot parse /proc/diskstats line: ", line) + utils.err("Cannot parse /proc/diskstats line: %s" % line) continue sys.stdout.flush() diff --git a/collectors/0/memcache.py b/collectors/0/memcache.py index 002570a3..e56f1b1a 100755 --- a/collectors/0/memcache.py +++ b/collectors/0/memcache.py @@ -65,7 +65,7 @@ def find_memcached(): port = line.find(" -p ") if port < 0: - utils.err("Weird memcached process without a -p argument:", 
line) + utils.err("Weird memcached process without a -p argument: %s" % line) continue port = line[port + 4: line.index(" ", port + 5)] port = int(port) @@ -73,7 +73,7 @@ def find_memcached(): utils.err("Host and port: %s %d" % (host, port)) yield host, port else: - utils.err("Unknown memached port:", port) + utils.err("Unknown memached port: %s" % port) def collect_stats(sock): diff --git a/collectors/0/procnettcp.py b/collectors/0/procnettcp.py index 4d598e4b..69bf1c2d 100755 --- a/collectors/0/procnettcp.py +++ b/collectors/0/procnettcp.py @@ -145,7 +145,7 @@ def main(unused_args): try: # On some Linux kernel versions, with lots of connections os.nice(19) # this collector can be very CPU intensive. So be nicer. except OSError as e: - utils.err("warning: failed to self-renice:", e) + utils.err("warning: failed to self-renice: %s" % e) interval = 60 diff --git a/collectors/300/aws_cloudwatch_stats.py b/collectors/300/aws_cloudwatch_stats.py index d7a2d93f..e2976fff 100755 --- a/collectors/300/aws_cloudwatch_stats.py +++ b/collectors/300/aws_cloudwatch_stats.py @@ -64,7 +64,7 @@ def cloudwatch_connect_to_region(region): conn = boto.ec2.cloudwatch.connect_to_region(region, aws_access_key_id=access_key, aws_secret_access_key=secret_access_key) except: - print("Unexpected error:", sys.exc_info()[0]) + utils.err("Unexpected error: %s" % sys.exc_info()[0]) else: return conn diff --git a/grok_scraper.py b/grok_scraper.py index 05924f30..b467ef1e 100755 --- a/grok_scraper.py +++ b/grok_scraper.py @@ -17,6 +17,7 @@ from io import StringIO from collectors.etc import grok_scraper_conf +from collectors.lib import utils COLLECTION_INTERVAL_SECONDS = 15 MATCHING_FILE_POLLING_INTERVAL_SECONDS = 1 @@ -305,7 +306,7 @@ def print_metric(metric_name, timestamp, value, tags): else: print_metric(g[0], timestamp, g[2], tags) except: - print("Unexpected error:", sys.exc_info()[0]) + utils.err("Unexpected error: %s" % sys.exc_info()[0]) traceback.print_exc() die() diff --git a/rpm/xcollector.spec b/rpm/xcollector.spec index 6026e7e5..136aef3f 100644 --- a/rpm/xcollector.spec +++ b/rpm/xcollector.spec @@ -106,7 +106,7 @@ mkdir -p %{buildroot}/%{py2_sitelib}/ %{tcollectordir}/collectors/0/ifstat.py %{tcollectordir}/collectors/0/iostat.py %{tcollectordir}/collectors/0/netstat.py -#%{tcollectordir}/collectors/0/procnettcp.py +%{tcollectordir}/collectors/0/procnettcp.py %{tcollectordir}/collectors/0/procstats.py #%{tcollectordir}/collectors/0/smart_stats.py %{tcollectordir}/collectors/0/mysql.py From 4b71bfea70acc1790d3605af105ae3e292abc561 Mon Sep 17 00:00:00 2001 From: Rajiv Shivane Date: Thu, 1 Mar 2018 17:29:25 +0530 Subject: [PATCH 8/8] Support to collect metrics over UDP --- collectors/etc/udp_bridge_conf.py | 2 +- deb/Makefile | 2 ++ rpm/xcollector.spec | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/collectors/etc/udp_bridge_conf.py b/collectors/etc/udp_bridge_conf.py index 7ba1fb4b..82497b69 100644 --- a/collectors/etc/udp_bridge_conf.py +++ b/collectors/etc/udp_bridge_conf.py @@ -4,7 +4,7 @@ def enabled(): return True def flush_delay(): - return 60 + return 15 def usetcp(): return False diff --git a/deb/Makefile b/deb/Makefile index e72d98da..23c04697 100644 --- a/deb/Makefile +++ b/deb/Makefile @@ -45,6 +45,7 @@ deb: cp $(VPATH)/collectors/0/netstat.py build/usr/local/xcollector/collectors/0/ cp $(VPATH)/collectors/0/procnettcp.py build/usr/local/xcollector/collectors/0/ cp $(VPATH)/collectors/0/procstats.py build/usr/local/xcollector/collectors/0/ + cp 
$(VPATH)/collectors/0/udp_bridge.py build/usr/local/xcollector/collectors/0/ cp $(VPATH)/collectors/__init__.py build/usr/local/xcollector/collectors/ cp $(VPATH)/collectors/etc/__init__.py build/usr/local/xcollector/collectors/etc/ cp $(VPATH)/collectors/etc/config.py build/usr/local/xcollector/collectors/etc/ @@ -52,6 +53,7 @@ deb: cp $(VPATH)/collectors/etc/metric_naming.py build/usr/local/xcollector/collectors/etc/ cp $(VPATH)/collectors/etc/mysqlconf.py build/usr/local/xcollector/collectors/etc/ cp $(VPATH)/collectors/etc/yaml_conf.py build/usr/local/xcollector/collectors/etc/ + cp $(VPATH)/collectors/etc/udp_bridge_conf.py build/usr/local/xcollector/collectors/etc/ cp $(VPATH)/collectors/lib/__init__.py build/usr/local/xcollector/collectors/lib/ cp $(VPATH)/collectors/lib/utils.py build/usr/local/xcollector/collectors/lib/ cp $(VPATH)/conf/*.yml build/usr/local/xcollector/conf/ diff --git a/rpm/xcollector.spec b/rpm/xcollector.spec index 136aef3f..459b8926 100644 --- a/rpm/xcollector.spec +++ b/rpm/xcollector.spec @@ -92,6 +92,7 @@ mkdir -p %{buildroot}/%{py2_sitelib}/ #%{tcollectordir}/collectors/etc/postgresqlconf.py #%{tcollectordir}/collectors/etc/udp_bridge_conf.py #%{tcollectordir}/collectors/etc/zabbix_bridge_conf.py +%{tcollectordir}/collectors/etc/udp_bridge_conf.py %{tcollectordir}/conf/grok.yml %config %{tcollectordir}/conf/grok_nginx.yml %config %{tcollectordir}/conf/grok_tomcat.yml %config @@ -108,6 +109,7 @@ mkdir -p %{buildroot}/%{py2_sitelib}/ %{tcollectordir}/collectors/0/netstat.py %{tcollectordir}/collectors/0/procnettcp.py %{tcollectordir}/collectors/0/procstats.py +%{tcollectordir}/collectors/0/udp_bridge.py #%{tcollectordir}/collectors/0/smart_stats.py %{tcollectordir}/collectors/0/mysql.py %{tcollectordir}/collectors/0/memcache.py