gluster · amarts · Mar 22, 2019 · Mar 19, 2019
diff --git a/glusterhealth/main.py b/glusterhealth/main.py
@@ -189,7 +189,7 @@ def main():
                 # In case of exception, log failure and also not_ok message
                 try:
                     func(context)
-                except:
+                except Exception:
                     output_error("Report failure", report=f)
                     logging.exception(lf("Report failure", report=f))
 

diff --git a/glusterhealth/reports/disk_usage.py b/glusterhealth/reports/disk_usage.py
@@ -9,15 +9,15 @@
 # later), or the GNU General Public License, version 2 (GPLv2), in all
 # cases as published by the Free Software Foundation.
 
-from .utils import get_disk_usage_details
+from .utils import get_disk_usage_details, byteorstr
 
 
 def check_disk_usage_percentage(ctx, path, percentage=0):
     out = get_disk_usage_details(path, ctx)
     if out is None:
         return
     if out.percentage:
-        used_percent = int(out.percentage.split('%')[0])
+        used_percent = int(out.percentage.split(byteorstr('%'))[0])
         if used_percent >= percentage:
             ctx.notok("Disk used percentage is exceeding threshold, "
                       "consider deleting unnecessary data",

diff --git a/glusterhealth/reports/firewall-check.py b/glusterhealth/reports/firewall-check.py
@@ -16,26 +16,26 @@
 
 def report_check_firewall_ports(ctx):
     cmd = "netstat -npl | grep 24007 | grep -i glusterd"
-    cmd1 = "netstat -npl | grep -n /glusterfsd | grep tcp" 
+    cmd1 = "netstat -npl | grep -n /glusterfsd | grep tcp"
     try:
         out = command_output(cmd)
         out1 = command_output(cmd1)
 
         if out:
-	    ctx.ok("Ports open for glusterd:\n"+ out)
+            ctx.ok("Ports open for glusterd:\n" + out)
         else:
             ctx.warning("Unable to find ports for glusterd")
 
         logging.warning("Firewall status of glusterd: \n" + out)
 
         if out1:
-            ctx.ok("Ports open for glusterfsd:\n"+ out1)
+            ctx.ok("Ports open for glusterfsd:\n" + out1)
         else:
             ctx.warning("Unable to find ports for glusterfsd")
 
         logging.warning("Firewall status of glusterfsd: \n" + out1)
-    except CommandError as e:
+    except CommandError as err:
         ctx.notok("some error with firewall check")
         logging.warn(ctx.lf("firewall check error",
-                            error_code=e[0],
-                            error=e[1]))                                  
+                            error_code=err.message[0],
+                            error=err.message[2]))
diff --git a/glusterhealth/reports/georep.py b/glusterhealth/reports/georep.py
@@ -1,17 +1,26 @@
 import logging
+import os
 
 from .utils import process_log_file
 
 from gluster.cli import volume, georep
 
 
-GSYNCD_LOG_FILE = ("/var/log/glusterfs/geo-replication/gv1/"
-                   "ssh%3A%2F%2Froot%40192.168.122.208%3Agluster"
-                   "%3A%2F%2F127.0.0.1%3Agv2.log")
-
 worker_restarts_data = {}
 
 
+def get_georep_log_files():
+    """Return the gsyncd.log path of each geo-replication session.
+
+    Returns an empty list when the geo-replication log directory is absent.
+    """
+    prefix = "/var/log/glusterfs/geo-replication"
+    if not os.path.isdir(prefix):
+        return []
+    paths = [os.path.join(prefix, d, "gsyncd.log") for d in os.listdir(prefix)]
+    return [p for p in paths if os.path.exists(p)]
+
+
 def filter_worker_restarts(line):
     if "starting gsyncd worker" in line:
         return True
@@ -34,14 +43,20 @@ def callback_worker_restarts(pline):
 
 
 def report_check_worker_restarts(ctx):
-    process_log_file(GSYNCD_LOG_FILE, callback_worker_restarts,
-                     filter_worker_restarts)
-    for k, v in worker_restarts_data.items():
-        if v <= 1:
-            ctx.ok("No Gsyncd worker restart", brick=k)
-        else:
-            ctx.warning("Gsyncd worker restarted more than once",
-                        brick=k, num_restarts=v)
+    """Report gsyncd worker restart counts for every geo-rep session log."""
+    for logfile in get_georep_log_files():
+        geosession = os.path.basename(os.path.dirname(logfile))
+        # worker_restarts_data is module-level state; reset it so counts
+        # from an earlier session's log are not re-reported under this one.
+        worker_restarts_data.clear()
+        process_log_file(logfile, callback_worker_restarts,
+                         filter_worker_restarts)
+        for k, v in worker_restarts_data.items():
+            if v <= 1:
+                ctx.ok("No Gsyncd worker restart", brick=k, session=geosession)
+            else:
+                ctx.warning("Gsyncd worker restarted more than once",
+                            brick=k, num_restarts=v, session=geosession)
 
 
 def report_non_participating_bricks(ctx):

diff --git a/glusterhealth/reports/gfid-mismatch-dht-report.py b/glusterhealth/reports/gfid-mismatch-dht-report.py
@@ -1,22 +1,41 @@
 import logging
+import os
 
 from .utils import command_output, CommandError
 
 
-logfile = "/var/log/glusterfs/mnt.log"
+def get_mount_log_files():
+    """Return paths of the "*.log" files in /var/log/glusterfs/ to scan.
+
+    The glusterd, cmd_history, cli and events logs are skipped, as are files
+    whose names start with "gluster-health-report-".
+    """
+    prefix = "/var/log/glusterfs/"
+    skipped = ("glusterd.log", "cmd_history.log", "cli.log", "events.log")
+    return [
+        # Joined with prefix so grep gets a path that works from any cwd.
+        os.path.join(prefix, f) for f in os.listdir(prefix)
+        if f.endswith(".log") and f not in skipped
+        and not f.startswith("gluster-health-report-")]
 
 
-def report_gfid__mismatch_dht(ctx):
+def gfid__mismatch_dht(logfile, ctx):
     cmd = "grep 'gfid differs' " + logfile + " | grep -v grep | wc -l"
     try:
         out = command_output(cmd)
         if int(out.strip()) > 0:
             ctx.error("gfid mismatch found",
                       no_of_mismatches=out.strip())
         else:
-            ctx.ok("no gfid mismatch")
-    except CommandError as e:
+            ctx.ok("No gfid mismatch", logfile=os.path.basename(logfile))
+    except CommandError as err:
         ctx.notok("Command failure")
         logging.warn(ctx.lf("Command Failure",
-                            error_code=e[0],
-                            error=e[1]))
+                            error_code=err.message[0],
+                            error=err.message[2]))
+
+
+def report_gfid__mismatch_dht(ctx):
+    logfiles = get_mount_log_files()
+    for logfile in logfiles:
+        gfid__mismatch_dht(logfile, ctx)
diff --git a/glusterhealth/reports/glusterd-op-version.py b/glusterhealth/reports/glusterd-op-version.py
@@ -11,7 +11,7 @@
 
 import logging
 
-from .utils import command_output, CommandError
+from .utils import command_output, CommandError, strfrombytes
 
 
 def report_check_glusterd_op_version(ctx):
@@ -20,8 +20,8 @@ def report_check_glusterd_op_version(ctx):
     try:
         out1 = command_output(cmd1)
         out2 = command_output(cmd2)
-        version1 = out1.split()[-1]
-        version2 = out2.split()[-1]
+        version1 = strfrombytes(out1.split()[-1])
+        version2 = strfrombytes(out2.split()[-1])
         if version1 != version2:
             ctx.warning("op-version is not up to date")
         else:

diff --git a/glusterhealth/reports/glusterd-peer-disconnect.py b/glusterhealth/reports/glusterd-peer-disconnect.py
@@ -1,7 +1,7 @@
 import logging
 
 from .utils import command_output, CommandError
-from .utils import process_log_file
+from .utils import process_log_file, byteorstr
 
 num_errors = 0
 
@@ -25,8 +25,9 @@ def report_peer_disconnect(ctx):
     cmd = "gluster peer status"
     try:
         out = command_output(cmd)
-        peer_count = out.split("\n")[0].split(":")[1].strip()
-        peer_conn_count = out.count("(Connected)")
+        peer_count = out.split(byteorstr("\n"))[0].split(
+            byteorstr(":"))[1].strip()
+        peer_conn_count = out.count(byteorstr("(Connected)"))
         dis_count = int(peer_count) - int(peer_conn_count)
         if 0 < dis_count:
             ctx.notok("One or more peer/s in disconnected/rejected state",
@@ -37,8 +38,8 @@ def report_peer_disconnect(ctx):
             ctx.ok("All peers are in connected state",
                    total_peer_count=int(peer_count),
                    connected_count=int(peer_conn_count))
-    except CommandError as e:
+    except CommandError as err:
         ctx.notok("Failed to check peer status")
         logging.warn(ctx.lf("peer status command failed",
-                            error_code=e[0],
-                            error=e[1]))
+                            error_code=err.message[0],
+                            error=err.message[2]))
diff --git a/glusterhealth/reports/kernel_issues.py b/glusterhealth/reports/kernel_issues.py
@@ -29,7 +29,7 @@ def report_gluster_hung_task(ctx):
                 if task_name in gluster_binaries:
                     gldict[task_name] = gldict[task_name] + 1
 
-    for task_name, times in gldict.iteritems():
+    for task_name, times in gldict.items():
         if times > 0:
             ctx.error("Gluster process was hung/blocked for more than "
                       "120 seconds",

diff --git a/glusterhealth/reports/memory_usage.py b/glusterhealth/reports/memory_usage.py
@@ -11,47 +11,66 @@
 
 import logging
 
-from utils import command_output, CommandError
+from .utils import command_output, CommandError, byteorstr, strfrombytes
 
 mem_used_limit = 90
 gluster_mem_limit = 30
-
+
+
 def report_system_memory_usage(ctx):
-	cmd = "free -m"
-	try:
-		out = command_output(cmd)
-		for line in out.split("\n"):
-			if "Mem" in line:
-				memtype, total, used, free, shared, cache, available = \
-							line.split()
-	 		elif "Swap" in line:
-				memtype, total, used, free = line.split()
-			else:
-				continue
-
-			percent = int(100 * float(used)/float(total))
-			if percent >= mem_used_limit:
-				ctx.notok("Memory used percentage on system is at alarming level", memtype=memtype.strip(':'), percentage=str(percent))
-	except CommandError as e:
-		ctx.notok("Failed to get memory usage")
-		logging.warn(ctx.lf("free command failed",
-							error_code=e[0],
-							error=e[1]))
+    """Flag RAM or swap usage (from `free -m`) at or above mem_used_limit."""
+    try:
+        out = command_output("free -m")
+        for line in out.split(byteorstr("\n")):
+            if byteorstr("Mem") in line or byteorstr("Swap") in line:
+                # Only the label, total and used columns are needed.
+                memtype, total, used = line.split()[:3]
+            else:
+                continue
+
+            if float(total) == 0:
+                continue  # e.g. no swap configured; avoid dividing by zero
+            percent = int(100 * float(used)/float(total))
+            if percent >= mem_used_limit:
+                ctx.notok("Memory used percentage on system is at "
+                          "alarming level",
+                          memtype=strfrombytes(memtype.strip(byteorstr(':'))),
+                          percentage=str(percent))
+    except CommandError as err:
+        ctx.notok("Failed to get memory usage")
+        logging.warn(ctx.lf("free command failed",
+                            error_code=err.message[0],
+                            error=err.message[2]))
+
 
 def report_gluster_memory_usage(ctx):
-	cmd = ['pgrep','gluster']
-	try:
-		out = command_output(cmd)
-		for pid in out.strip().split("\n"):
-			mem_cmd = "ps -p {} -o %mem".format(pid)
-			mem_out = command_output(mem_cmd)
-			mem_percent = mem_out.split('\n')[1].split('.')[0].strip()
-			if int(mem_percent) >= gluster_mem_limit:
-				proc_cmd = "ps -p {} -o comm=".format(pid)
-				proc_out = command_output(proc_cmd)
-				ctx.notok("Memory used by gluster process is at alarming level", process_name=proc_out.strip(), percentage=mem_percent)
-	except CommandError as e:
-		ctx.notok("Failed to get memory usage of gluster processes")
-		logging.warn(ctx.lf("pgrep/ps command failed",
-							error_code=e[0],
-							error=e[1]))
+    cmd = ['pgrep', 'gluster']
+    out = byteorstr("")
+    try:
+        out = command_output(cmd)
+    except CommandError:
+        ctx.notok("No gluster process running")
+        return
+
+    try:
+        for pid in out.strip().split(byteorstr("\n")):
+            mem_cmd = "ps -p {} -o %mem=,comm=".format(strfrombytes(pid))
+            mem_out = command_output(mem_cmd)
+            data = mem_out.strip().split(byteorstr(" "))
+            mem_percent = strfrombytes(data[0].split(
+                byteorstr('.'))[0].strip())
+            proc_name = strfrombytes(data[1].strip())
+            if int(mem_percent) >= gluster_mem_limit:
+                ctx.notok("Memory used by gluster process is at "
+                          "alarming level",
+                          process_name=proc_name,
+                          percentage=mem_percent)
+            else:
+                ctx.ok("Memory usage of gluster process",
+                       process_name=proc_name,
+                       percentage=mem_percent)
+    except CommandError as err:
+        ctx.notok("Failed to get memory usage of gluster processes")
+        logging.warn(ctx.lf("pgrep/ps command failed",
+                            error_code=err.message[0],
+                            error=err.message[2]))
diff --git a/glusterhealth/reports/process_status.sh b/glusterhealth/reports/process_status.sh
@@ -1,12 +1,18 @@
 #!/bin/bash
+. "$(dirname "$(readlink -e "$0")")/utils.sh"
 
 good=80
 
 pid=`gluster v status| grep localhost|awk '/Self-heal/ {print $NF}'|uniq`
 value=`ps -p $pid -o %cpu | grep -v CPU | cut -d'.' -f1` 
-echo $value
+
+if [ "x$value" == "x" ] ; then
+    exit 0;
+fi
 
 if (( $(echo "$value $good" | awk '{print ($1 > $2)}') ))
 then 
-    echo "High CPU usage by Self-heal $value"
+    NOTOK "High CPU usage by Self-heal value=$value"
+else
+    OK "CPU usage by Self-heal value=$value"
 fi