From 6e63eef0af3243b6e5e053eb9ac0a44eb020334d Mon Sep 17 00:00:00 2001 From: Aravinda VK Date: Tue, 19 Mar 2019 14:21:09 +0530 Subject: [PATCH] A few fixes: - Python3/Python2 bytes and str handling fix - Exception handling fixes - Report log path fixes(dht and georep) Signed-off-by: Aravinda VK --- glusterhealth/main.py | 2 +- glusterhealth/reports/disk_usage.py | 4 +- glusterhealth/reports/firewall-check.py | 12 +-- glusterhealth/reports/georep.py | 35 ++++--- .../reports/gfid-mismatch-dht-report.py | 31 ++++-- glusterhealth/reports/glusterd-op-version.py | 6 +- .../reports/glusterd-peer-disconnect.py | 13 +-- glusterhealth/reports/kernel_issues.py | 2 +- glusterhealth/reports/memory_usage.py | 95 +++++++++++-------- glusterhealth/reports/process_status.sh | 10 +- glusterhealth/reports/utils.py | 38 +++++--- setup.py | 6 +- 12 files changed, 162 insertions(+), 92 deletions(-) diff --git a/glusterhealth/main.py b/glusterhealth/main.py index 11db5ea..e35a613 100644 --- a/glusterhealth/main.py +++ b/glusterhealth/main.py @@ -189,7 +189,7 @@ def main(): # In case of exception, log failure and also not_ok message try: func(context) - except: + except Exception: output_error("Report failure", report=f) logging.exception(lf("Report failure", report=f)) diff --git a/glusterhealth/reports/disk_usage.py b/glusterhealth/reports/disk_usage.py index aba47e7..48d620e 100644 --- a/glusterhealth/reports/disk_usage.py +++ b/glusterhealth/reports/disk_usage.py @@ -9,7 +9,7 @@ # later), or the GNU General Public License, version 2 (GPLv2), in all # cases as published by the Free Software Foundation. -from .utils import get_disk_usage_details +from .utils import get_disk_usage_details, byteorstr def check_disk_usage_percentage(ctx, path, percentage=0): @@ -17,7 +17,7 @@ def check_disk_usage_percentage(ctx, path, percentage=0): if out is None: return if out.percentage: - used_percent = int(out.percentage.split('%')[0]) + used_percent = int(out.percentage.split(byteorstr('%'))[0]) if used_percent >= percentage: ctx.notok("Disk used percentage is exceeding threshold, " "consider deleting unnecessary data", diff --git a/glusterhealth/reports/firewall-check.py b/glusterhealth/reports/firewall-check.py index 6e620ff..598b9b4 100644 --- a/glusterhealth/reports/firewall-check.py +++ b/glusterhealth/reports/firewall-check.py @@ -16,26 +16,26 @@ def report_check_firewall_ports(ctx): cmd = "netstat -npl | grep 24007 | grep -i glusterd" - cmd1 = "netstat -npl | grep -n /glusterfsd | grep tcp" + cmd1 = "netstat -npl | grep -n /glusterfsd | grep tcp" try: out = command_output(cmd) out1 = command_output(cmd1) if out: - ctx.ok("Ports open for glusterd:\n"+ out) + ctx.ok("Ports open for glusterd:\n" + out) else: ctx.warning("Unable to find ports for glusterd") logging.warning("Firewall status of glusterd: \n" + out) if out1: - ctx.ok("Ports open for glusterfsd:\n"+ out1) + ctx.ok("Ports open for glusterfsd:\n" + out1) else: ctx.warning("Unable to find ports for glusterfsd") logging.warning("Firewall status of glusterfsd: \n" + out1) - except CommandError as e: + except CommandError as err: ctx.notok("some error with firewall check") logging.warn(ctx.lf("firewall check error", - error_code=e[0], - error=e[1])) + error_code=err.message[0], + error=err.message[2])) diff --git a/glusterhealth/reports/georep.py b/glusterhealth/reports/georep.py index 809d3ce..cfa79ba 100644 --- a/glusterhealth/reports/georep.py +++ b/glusterhealth/reports/georep.py @@ -1,17 +1,26 @@ import logging +import os from .utils import process_log_file from gluster.cli import volume, georep -GSYNCD_LOG_FILE = ("/var/log/glusterfs/geo-replication/gv1/" - "ssh%3A%2F%2Froot%40192.168.122.208%3Agluster" - "%3A%2F%2F127.0.0.1%3Agv2.log") - worker_restarts_data = {} +def get_georep_log_files(): + logfiles = [] + prefix = "/var/log/glusterfs/geo-replication" + for sessiondir in os.listdir(prefix): + p = os.path.join(prefix, sessiondir) + gsyncdlogfile = os.path.join(p, "gsyncd.log") + if os.path.isdir(p) and os.path.exists(gsyncdlogfile): + logfiles.append(gsyncdlogfile) + + return logfiles + + def filter_worker_restarts(line): if "starting gsyncd worker" in line: return True @@ -34,14 +43,16 @@ def callback_worker_restarts(pline): def report_check_worker_restarts(ctx): - process_log_file(GSYNCD_LOG_FILE, callback_worker_restarts, - filter_worker_restarts) - for k, v in worker_restarts_data.items(): - if v <= 1: - ctx.ok("No Gsyncd worker restart", brick=k) - else: - ctx.warning("Gsyncd worker restarted more than once", - brick=k, num_restarts=v) + for logfile in get_georep_log_files(): + geosession = os.path.basename(os.path.dirname(logfile)) + process_log_file(logfile, callback_worker_restarts, + filter_worker_restarts) + for k, v in worker_restarts_data.items(): + if v <= 1: + ctx.ok("No Gsyncd worker restart", brick=k, session=geosession) + else: + ctx.warning("Gsyncd worker restarted more than once", + brick=k, num_restarts=v, session=geosession) def report_non_participating_bricks(ctx): diff --git a/glusterhealth/reports/gfid-mismatch-dht-report.py b/glusterhealth/reports/gfid-mismatch-dht-report.py index cff17cb..fde5e0b 100644 --- a/glusterhealth/reports/gfid-mismatch-dht-report.py +++ b/glusterhealth/reports/gfid-mismatch-dht-report.py @@ -1,12 +1,25 @@ import logging +import os from .utils import command_output, CommandError -logfile = "/var/log/glusterfs/mnt.log" +def get_mount_log_files(): + prefix = "/var/log/glusterfs/" + logfiles = [] + for f in os.listdir(prefix): + if f in ["glusterd.log", "cmd_history.log", "cli.log", "events.log"]: + continue + if f.startswith("gluster-health-report-"): + continue -def report_gfid__mismatch_dht(ctx): + if f.endswith(".log"): + logfiles.append(os.path.join(f)) + return logfiles + + +def gfid__mismatch_dht(logfile, ctx): cmd = "grep 'gfid differs' " + logfile + " | grep -v grep | wc -l" try: out = command_output(cmd) @@ -14,9 +27,15 @@ def report_gfid__mismatch_dht(ctx): ctx.error("gfid mismatch found", no_of_mismatches=out.strip()) else: - ctx.ok("no gfid mismatch") - except CommandError as e: + ctx.ok("No gfid mismatch", logfile=os.path.basename(logfile)) + except CommandError as err: ctx.notok("Command failure") logging.warn(ctx.lf("Command Failure", - error_code=e[0], - error=e[1])) + error_code=err.message[0], + error=err.message[2])) + + +def report_gfid__mismatch_dht(ctx): + logfiles = get_mount_log_files() + for logfile in logfiles: + gfid__mismatch_dht(logfile, ctx) diff --git a/glusterhealth/reports/glusterd-op-version.py b/glusterhealth/reports/glusterd-op-version.py index 347b431..36f4e0e 100644 --- a/glusterhealth/reports/glusterd-op-version.py +++ b/glusterhealth/reports/glusterd-op-version.py @@ -11,7 +11,7 @@ import logging -from .utils import command_output, CommandError +from .utils import command_output, CommandError, strfrombytes def report_check_glusterd_op_version(ctx): @@ -20,8 +20,8 @@ def report_check_glusterd_op_version(ctx): try: out1 = command_output(cmd1) out2 = command_output(cmd2) - version1 = out1.split()[-1] - version2 = out2.split()[-1] + version1 = strfrombytes(out1.split()[-1]) + version2 = strfrombytes(out2.split()[-1]) if version1 != version2: ctx.warning("op-version is not up to date") else: diff --git a/glusterhealth/reports/glusterd-peer-disconnect.py b/glusterhealth/reports/glusterd-peer-disconnect.py index 11f36af..af0ec1a 100644 --- a/glusterhealth/reports/glusterd-peer-disconnect.py +++ b/glusterhealth/reports/glusterd-peer-disconnect.py @@ -1,7 +1,7 @@ import logging from .utils import command_output, CommandError -from .utils import process_log_file +from .utils import process_log_file, byteorstr num_errors = 0 @@ -25,8 +25,9 @@ def report_peer_disconnect(ctx): cmd = "gluster peer status" try: out = command_output(cmd) - peer_count = out.split("\n")[0].split(":")[1].strip() - peer_conn_count = out.count("(Connected)") + peer_count = out.split(byteorstr("\n"))[0].split( + byteorstr(":"))[1].strip() + peer_conn_count = out.count(byteorstr("(Connected)")) dis_count = int(peer_count) - int(peer_conn_count) if 0 < dis_count: ctx.notok("One or more peer/s in disconnected/rejected state", @@ -37,8 +38,8 @@ def report_peer_disconnect(ctx): ctx.ok("All peers are in connected state", total_peer_count=int(peer_count), connected_count=int(peer_conn_count)) - except CommandError as e: + except CommandError as err: ctx.notok("Failed to check peer status") logging.warn(ctx.lf("peer status command failed", - error_code=e[0], - error=e[1])) + error_code=err.message[0], + error=err.message[2])) diff --git a/glusterhealth/reports/kernel_issues.py b/glusterhealth/reports/kernel_issues.py index 42af133..18273b5 100755 --- a/glusterhealth/reports/kernel_issues.py +++ b/glusterhealth/reports/kernel_issues.py @@ -29,7 +29,7 @@ def report_gluster_hung_task(ctx): if task_name in gluster_binaries: gldict[task_name] = gldict[task_name] + 1 - for task_name, times in gldict.iteritems(): + for task_name, times in gldict.items(): if times > 0: ctx.error("Gluster process was hung/blocked for more than " "120 seconds", diff --git a/glusterhealth/reports/memory_usage.py b/glusterhealth/reports/memory_usage.py index 0e410c8..34a185a 100644 --- a/glusterhealth/reports/memory_usage.py +++ b/glusterhealth/reports/memory_usage.py @@ -11,47 +11,66 @@ import logging -from utils import command_output, CommandError +from .utils import command_output, CommandError, byteorstr, strfrombytes mem_used_limit = 90 gluster_mem_limit = 30 - + + def report_system_memory_usage(ctx): - cmd = "free -m" - try: - out = command_output(cmd) - for line in out.split("\n"): - if "Mem" in line: - memtype, total, used, free, shared, cache, available = \ - line.split() - elif "Swap" in line: - memtype, total, used, free = line.split() - else: - continue - - percent = int(100 * float(used)/float(total)) - if percent >= mem_used_limit: - ctx.notok("Memory used percentage on system is at alarming level", memtype=memtype.strip(':'), percentage=str(percent)) - except CommandError as e: - ctx.notok("Failed to get memory usage") - logging.warn(ctx.lf("free command failed", - error_code=e[0], - error=e[1])) + cmd = "free -m" + try: + out = command_output(cmd) + for line in out.split(byteorstr("\n")): + if byteorstr("Mem") in line: + memtype, total, used, free, shared, cache, available = \ + line.split() + elif byteorstr("Swap") in line: + memtype, total, used, free = line.split() + else: + continue + + percent = int(100 * float(used)/float(total)) + if percent >= mem_used_limit: + ctx.notok("Memory used percentage on system is at " + "alarming level", + memtype=memtype.strip(':'), + percentage=str(percent)) + except CommandError as err: + ctx.notok("Failed to get memory usage") + logging.warn(ctx.lf("free command failed", + error_code=err.message[0], + error=err.message[2])) + def report_gluster_memory_usage(ctx): - cmd = ['pgrep','gluster'] - try: - out = command_output(cmd) - for pid in out.strip().split("\n"): - mem_cmd = "ps -p {} -o %mem".format(pid) - mem_out = command_output(mem_cmd) - mem_percent = mem_out.split('\n')[1].split('.')[0].strip() - if int(mem_percent) >= gluster_mem_limit: - proc_cmd = "ps -p {} -o comm=".format(pid) - proc_out = command_output(proc_cmd) - ctx.notok("Memory used by gluster process is at alarming level", process_name=proc_out.strip(), percentage=mem_percent) - except CommandError as e: - ctx.notok("Failed to get memory usage of gluster processes") - logging.warn(ctx.lf("pgrep/ps command failed", - error_code=e[0], - error=e[1])) + cmd = ['pgrep', 'gluster'] + out = byteorstr("") + try: + out = command_output(cmd) + except CommandError: + ctx.notok("No gluster process running") + return + + try: + for pid in out.strip().split(byteorstr("\n")): + mem_cmd = "ps -p {} -o %mem=,comm=".format(strfrombytes(pid)) + mem_out = command_output(mem_cmd) + data = mem_out.strip().split(byteorstr(" ")) + mem_percent = strfrombytes(data[0].split( + byteorstr('.'))[0].strip()) + proc_name = strfrombytes(data[1].strip()) + if int(mem_percent) >= gluster_mem_limit: + ctx.notok("Memory used by gluster process is at " + "alarming level", + process_name=proc_name, + percentage=mem_percent) + else: + ctx.ok("Memory usage of gluster process", + process_name=proc_name, + percentage=mem_percent) + except CommandError as err: + ctx.notok("Failed to get memory usage of gluster processes") + logging.warn(ctx.lf("pgrep/ps command failed", + error_code=err.message[0], + error=err.message[2])) diff --git a/glusterhealth/reports/process_status.sh b/glusterhealth/reports/process_status.sh index 0099990..64c62cb 100755 --- a/glusterhealth/reports/process_status.sh +++ b/glusterhealth/reports/process_status.sh @@ -1,12 +1,18 @@ #!/bin/bash +. $(dirname $(readlink -e $0))/utils.sh good=80 pid=`gluster v status| grep localhost|awk '/Self-heal/ {print $NF}'|uniq` value=`ps -p $pid -o %cpu | grep -v CPU | cut -d'.' -f1` -echo $value + +if [ "x$value" == "x" ] ; then + exit 0; +fi if (( $(echo "$value $good" | awk '{print ($1 > $2)}') )) then - echo "High CPU usage by Self-heal $value" + NOTOK "High CPU usage by Self-heal value=$value" +else + OK "CPU usage by Self-heal value=$value" fi diff --git a/glusterhealth/reports/utils.py b/glusterhealth/reports/utils.py index e23bd83..e227549 100644 --- a/glusterhealth/reports/utils.py +++ b/glusterhealth/reports/utils.py @@ -12,10 +12,24 @@ import logging import re from subprocess import Popen, PIPE +import sys + + +def byteorstr(val): + if sys.version_info >= (3,): + return val.encode() + return val + + +def strfrombytes(val): + if sys.version_info >= (3,): + return val.decode() + return val class CommandError(Exception): - pass + def __init__(self, value): + self.message = value def command_output(cmd): @@ -23,19 +37,19 @@ def command_output(cmd): p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=shell) out, err = p.communicate() if p.returncode != 0: - raise CommandError(p.returncode, err.strip()) + raise CommandError((p.returncode, out.strip(), err.strip())) return out # [TS] LOG_LEVEL [MSGID: ] [FILE:LINE:FUNC] DOMAIN: MSG # MSGID is optional and MSG can be structured log format or can be normal msg -log_pattern = re.compile('\[([^\]]+)\]\s' - '([IEWTD])\s' - '(\[MSGID:\s([^\]]+)\]\s)?' - '\[([^\]]+)\]\s' - '([^:]+):\s' - '(.+)') +log_pattern = re.compile(r'\[([^\]]+)\]\s' + r'([IEWTD])\s' + r'(\[MSGID:\s([^\]]+)\]\s)?' + r'\[([^\]]+)\]\s' + r'([^:]+):\s' + r'(.+)') class ParsedData(object): @@ -127,12 +141,12 @@ def get_disk_usage_details(path, ctx): try: out = command_output(cmd) device, size, used, available, percentage, mountpoint = \ - out.split("\n")[1].split() + out.split(byteorstr("\n"))[1].split() return DiskUsage(device, size, used, available, percentage, mountpoint) - except CommandError as e: + except CommandError as err: logging.warning("Disk usage: \n" + out) logging.warn(ctx.lf("disk usage failed", - error_code=e[0], - error=e[1])) + error_code=err.message[0], + error=err.message[2])) return None diff --git a/setup.py b/setup.py index 6b7e1d9..f91c332 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ setup( name="gluster-health-report", - version="0.4", + version="0.7", packages=["glusterhealth", "glusterhealth.reports"], include_package_data=True, install_requires=["glustercli"], @@ -30,7 +30,7 @@ description="Gluster Health Report tools", license="GPLv2", keywords="gluster, tool, health", - url="https://github.com/aravindavk/gluster-health-report", + url="https://github.com/gluster/gluster-health-report", long_description=""" Gluster Health Report """, @@ -43,6 +43,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 2 :: Only" + "Programming Language :: Python :: 3", ], )