Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A few fixes: #37

Merged
merged 1 commit into from
Mar 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion glusterhealth/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def main():
# In case of exception, log failure and also not_ok message
try:
func(context)
except:
except Exception:
output_error("Report failure", report=f)
logging.exception(lf("Report failure", report=f))

Expand Down
4 changes: 2 additions & 2 deletions glusterhealth/reports/disk_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
# later), or the GNU General Public License, version 2 (GPLv2), in all
# cases as published by the Free Software Foundation.

from .utils import get_disk_usage_details
from .utils import get_disk_usage_details, byteorstr


def check_disk_usage_percentage(ctx, path, percentage=0):
out = get_disk_usage_details(path, ctx)
if out is None:
return
if out.percentage:
used_percent = int(out.percentage.split('%')[0])
used_percent = int(out.percentage.split(byteorstr('%'))[0])
if used_percent >= percentage:
ctx.notok("Disk used percentage is exceeding threshold, "
"consider deleting unnecessary data",
Expand Down
12 changes: 6 additions & 6 deletions glusterhealth/reports/firewall-check.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,26 @@

def report_check_firewall_ports(ctx):
    """Report the listening ports found for glusterd and glusterfsd.

    Each daemon is looked up with its own ``netstat`` pipeline and is
    checked independently, so a failing lookup for one daemon no longer
    hides the result already obtained for the other.

    :param ctx: report context providing ``ok``, ``warning``, ``notok``
        and ``lf``.
    """
    checks = (
        ("glusterd", "netstat -npl | grep 24007 | grep -i glusterd"),
        ("glusterfsd", "netstat -npl | grep -n /glusterfsd | grep tcp"),
    )

    for daemon, cmd in checks:
        try:
            out = command_output(cmd)
        except CommandError as err:
            ctx.notok("some error with firewall check", daemon=daemon)
            # NOTE(review): assumes CommandError.message is an indexable
            # sequence with the exit code at [0] and the error text at
            # [2] -- confirm against utils.
            logging.warning(ctx.lf("firewall check error",
                                   error_code=err.message[0],
                                   error=err.message[2]))
            continue

        if out:
            ctx.ok(("Ports open for %s:\n" % daemon) + out)
        else:
            ctx.warning("Unable to find ports for %s" % daemon)

        logging.warning(("Firewall status of %s: \n" % daemon) + out)
35 changes: 23 additions & 12 deletions glusterhealth/reports/georep.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
import logging
import os

from .utils import process_log_file

from gluster.cli import volume, georep


GSYNCD_LOG_FILE = ("/var/log/glusterfs/geo-replication/gv1/"
"ssh%3A%2F%2Froot%40192.168.122.208%3Agluster"
"%3A%2F%2F127.0.0.1%3Agv2.log")

worker_restarts_data = {}


def get_georep_log_files(prefix="/var/log/glusterfs/geo-replication"):
    """Return the ``gsyncd.log`` path of every geo-replication session.

    Every sub-directory of ``prefix`` that contains a ``gsyncd.log`` is
    treated as one session.

    :param prefix: directory holding one sub-directory per session.
    :returns: list of log file paths, sorted by session directory name;
        empty when ``prefix`` does not exist.
    """
    # Without the log directory there is nothing to scan; return an
    # empty list instead of letting os.listdir raise.
    if not os.path.isdir(prefix):
        return []

    logfiles = []
    # Sorted so the report order is stable between runs.
    for sessiondir in sorted(os.listdir(prefix)):
        p = os.path.join(prefix, sessiondir)
        gsyncdlogfile = os.path.join(p, "gsyncd.log")
        if os.path.isdir(p) and os.path.exists(gsyncdlogfile):
            logfiles.append(gsyncdlogfile)

    return logfiles


def filter_worker_restarts(line):
if "starting gsyncd worker" in line:
return True
Expand All @@ -34,14 +43,16 @@ def callback_worker_restarts(pline):


def report_check_worker_restarts(ctx):
    """Report, per geo-replication session, how often each worker restarted.

    Every session log returned by ``get_georep_log_files`` is scanned
    with ``process_log_file``; the counts are then read from the
    module-level ``worker_restarts_data`` and reported with the session
    name (the log file's parent directory).

    :param ctx: report context providing ``ok`` and ``warning``.
    """
    for logfile in get_georep_log_files():
        geosession = os.path.basename(os.path.dirname(logfile))

        # worker_restarts_data is shared module state that is never reset
        # elsewhere in the visible code. Clear it so bricks counted for a
        # previous session are not reported again under this one.
        # NOTE(review): presumes callback_worker_restarts is what fills
        # this dict -- its body is not visible here; confirm.
        worker_restarts_data.clear()

        process_log_file(logfile, callback_worker_restarts,
                         filter_worker_restarts)

        for k, v in worker_restarts_data.items():
            if v <= 1:
                ctx.ok("No Gsyncd worker restart",
                       brick=k, session=geosession)
            else:
                ctx.warning("Gsyncd worker restarted more than once",
                            brick=k, num_restarts=v, session=geosession)


def report_non_participating_bricks(ctx):
Expand Down
31 changes: 25 additions & 6 deletions glusterhealth/reports/gfid-mismatch-dht-report.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,41 @@
import logging
import os

from .utils import command_output, CommandError


logfile = "/var/log/glusterfs/mnt.log"
def get_mount_log_files(prefix="/var/log/glusterfs/"):
    """Return the full paths of the ``*.log`` files directly under ``prefix``.

    The glusterd, CLI, command-history and events logs are excluded, as
    are files whose name starts with ``gluster-health-report-``.

    :param prefix: directory to scan.
    :returns: list of log file paths, sorted by file name; empty when
        ``prefix`` does not exist.
    """
    excluded = ("glusterd.log", "cmd_history.log", "cli.log", "events.log")

    if not os.path.isdir(prefix):
        return []

    logfiles = []
    for f in sorted(os.listdir(prefix)):
        if f in excluded:
            continue

        if f.startswith("gluster-health-report-"):
            continue

        if f.endswith(".log"):
            # Join with the directory: a bare file name would be resolved
            # against the current working directory by whoever opens it.
            logfiles.append(os.path.join(prefix, f))
    return logfiles


def gfid__mismatch_dht(logfile, ctx):
    """Count 'gfid differs' lines in one log file and report the result.

    :param logfile: path of the log file to grep.
    :param ctx: report context providing ``ok``, ``error``, ``notok``
        and ``lf``.
    """
    # Several log files are scanned per run, so every report carries the
    # file name to keep the findings attributable.
    logname = os.path.basename(logfile)
    cmd = "grep 'gfid differs' " + logfile + " | grep -v grep | wc -l"
    try:
        out = command_output(cmd)
        count = out.strip()
        if int(count) > 0:
            ctx.error("gfid mismatch found",
                      no_of_mismatches=count,
                      logfile=logname)
        else:
            ctx.ok("No gfid mismatch", logfile=logname)
    except CommandError as err:
        ctx.notok("Command failure", logfile=logname)
        # NOTE(review): assumes CommandError.message is an indexable
        # sequence with the exit code at [0] and the error text at [2]
        # -- confirm against utils.
        logging.warning(ctx.lf("Command Failure",
                               error_code=err.message[0],
                               error=err.message[2]))


def report_gfid__mismatch_dht(ctx):
    """Run the gfid-mismatch check on every file from get_mount_log_files."""
    for mount_log in get_mount_log_files():
        gfid__mismatch_dht(mount_log, ctx)
6 changes: 3 additions & 3 deletions glusterhealth/reports/glusterd-op-version.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import logging

from .utils import command_output, CommandError
from .utils import command_output, CommandError, strfrombytes


def report_check_glusterd_op_version(ctx):
Expand All @@ -20,8 +20,8 @@ def report_check_glusterd_op_version(ctx):
try:
out1 = command_output(cmd1)
out2 = command_output(cmd2)
version1 = out1.split()[-1]
version2 = out2.split()[-1]
version1 = strfrombytes(out1.split()[-1])
version2 = strfrombytes(out2.split()[-1])
if version1 != version2:
ctx.warning("op-version is not up to date")
else:
Expand Down
13 changes: 7 additions & 6 deletions glusterhealth/reports/glusterd-peer-disconnect.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging

from .utils import command_output, CommandError
from .utils import process_log_file
from .utils import process_log_file, byteorstr

num_errors = 0

Expand All @@ -25,8 +25,9 @@ def report_peer_disconnect(ctx):
cmd = "gluster peer status"
try:
out = command_output(cmd)
peer_count = out.split("\n")[0].split(":")[1].strip()
peer_conn_count = out.count("(Connected)")
peer_count = out.split(byteorstr("\n"))[0].split(
byteorstr(":"))[1].strip()
peer_conn_count = out.count(byteorstr("(Connected)"))
dis_count = int(peer_count) - int(peer_conn_count)
if 0 < dis_count:
ctx.notok("One or more peer/s in disconnected/rejected state",
Expand All @@ -37,8 +38,8 @@ def report_peer_disconnect(ctx):
ctx.ok("All peers are in connected state",
total_peer_count=int(peer_count),
connected_count=int(peer_conn_count))
except CommandError as e:
except CommandError as err:
ctx.notok("Failed to check peer status")
logging.warn(ctx.lf("peer status command failed",
error_code=e[0],
error=e[1]))
error_code=err.message[0],
error=err.message[2]))
2 changes: 1 addition & 1 deletion glusterhealth/reports/kernel_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def report_gluster_hung_task(ctx):
if task_name in gluster_binaries:
gldict[task_name] = gldict[task_name] + 1

for task_name, times in gldict.iteritems():
for task_name, times in gldict.items():
if times > 0:
ctx.error("Gluster process was hung/blocked for more than "
"120 seconds",
Expand Down
95 changes: 57 additions & 38 deletions glusterhealth/reports/memory_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,47 +11,66 @@

import logging

from utils import command_output, CommandError
from .utils import command_output, CommandError, byteorstr, strfrombytes

mem_used_limit = 90
gluster_mem_limit = 30



def report_system_memory_usage(ctx):
    """Flag the ``Mem`` and ``Swap`` rows of ``free -m`` that exceed the limit.

    For each row the used percentage is computed from the ``total`` and
    ``used`` columns and reported through ``ctx.notok`` when it is at or
    above ``mem_used_limit``.

    :param ctx: report context providing ``notok`` and ``lf``.
    """
    try:
        out = command_output("free -m")
    except CommandError as err:
        ctx.notok("Failed to get memory usage")
        # NOTE(review): assumes CommandError.message is an indexable
        # sequence with the exit code at [0] and the error text at [2]
        # -- confirm against utils.
        logging.warning(ctx.lf("free command failed",
                               error_code=err.message[0],
                               error=err.message[2]))
        return

    for line in out.split(byteorstr("\n")):
        if byteorstr("Mem") not in line and byteorstr("Swap") not in line:
            continue

        fields = line.split()
        if len(fields) < 3:
            continue
        # Only the label, total and used columns are needed; the number
        # of remaining columns differs between the two rows.
        memtype, total, used = fields[:3]

        # A row with a zero total (for example no swap configured) has
        # no meaningful percentage and would divide by zero.
        if float(total) == 0:
            continue

        percent = int(100 * float(used) / float(total))
        if percent >= mem_used_limit:
            # NOTE(review): presumes strfrombytes yields a text string so
            # that stripping ':' works on both Python 2 and 3 -- confirm.
            ctx.notok("Memory used percentage on system is at "
                      "alarming level",
                      memtype=strfrombytes(memtype).strip(':'),
                      percentage=str(percent))


def report_gluster_memory_usage(ctx):
    """Report the memory share of every process matched by ``pgrep gluster``.

    Each pid is queried with ``ps -o %mem=,comm=``. A process at or above
    ``gluster_mem_limit`` percent is reported with ``ctx.notok``, any
    other with ``ctx.ok``.

    :param ctx: report context providing ``ok``, ``notok`` and ``lf``.
    """
    try:
        out = command_output(['pgrep', 'gluster'])
    except CommandError:
        ctx.notok("No gluster process running")
        return

    for pid in out.strip().split(byteorstr("\n")):
        if not pid:
            continue

        mem_cmd = "ps -p {} -o %mem=,comm=".format(strfrombytes(pid))
        try:
            mem_out = command_output(mem_cmd)
        except CommandError as err:
            # A process may exit between pgrep and ps; report it and keep
            # checking the remaining pids instead of aborting the loop.
            ctx.notok("Failed to get memory usage of gluster processes")
            # NOTE(review): assumes CommandError.message is an indexable
            # sequence with the exit code at [0] and the error text at
            # [2] -- confirm against utils.
            logging.warning(ctx.lf("pgrep/ps command failed",
                                   error_code=err.message[0],
                                   error=err.message[2]))
            continue

        # Split on any run of whitespace: ps pads its columns, so a
        # single-space split can yield empty fields.
        data = mem_out.strip().split(None, 1)
        if len(data) < 2:
            continue

        mem_percent = strfrombytes(data[0].split(byteorstr('.'))[0].strip())
        proc_name = strfrombytes(data[1].strip())
        if int(mem_percent) >= gluster_mem_limit:
            ctx.notok("Memory used by gluster process is at "
                      "alarming level",
                      process_name=proc_name,
                      percentage=mem_percent)
        else:
            ctx.ok("Memory usage of gluster process",
                   process_name=proc_name,
                   percentage=mem_percent)
10 changes: 8 additions & 2 deletions glusterhealth/reports/process_status.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
#!/bin/bash
# Health check: report the CPU usage of the local Self-heal daemon and
# flag it when it is above $good percent.
# OK / NOTOK are expected to come from the sourced utils.sh.
. "$(dirname "$(readlink -e "$0")")/utils.sh"

good=80

# The last column of the localhost Self-heal line is used as the pid.
pid=$(gluster v status | grep localhost | awk '/Self-heal/ {print $NF}' | uniq | head -n 1)

# Nothing to check unless we got a plain numeric pid; passing anything
# else to `ps -p` only produces an error.
case "$pid" in
    ''|*[!0-9]*) exit 0 ;;
esac

# `%cpu=` suppresses the header line; keep only the integer part.
value=$(ps -p "$pid" -o %cpu= | cut -d'.' -f1 | tr -d ' ')

if [ "x$value" == "x" ] ; then
    exit 0
fi

if [ "$value" -gt "$good" ]
then
    NOTOK "High CPU usage by Self-heal value=$value"
else
    OK "CPU usage by Self-heal value=$value"
fi
Loading