From 46899ea759693d12bd030cfc6cb82900649e06f8 Mon Sep 17 00:00:00 2001
From: Michael Jennings
Date: Thu, 2 Oct 2014 20:28:07 +0000
Subject: [PATCH] Some lightly modified patches for better integration/interaction with Grid Engine from Dave Love , current author and maintainer of the open source Son of Grid Engine project at the University of Liverpool (see https://arc.liv.ac.uk/trac/SGE for project info). This significantly improves compatibility with SoGE, UGE, OGS, and other derived works based on the original Sun Grid Engine.

---
Review note: the SGE load-sensor report lines in die() and nhcmain_finish()
redirect with ">&$NHC_FD_OUT" (duplicate the file descriptor); a plain
">$NHC_FD_OUT" would create a file literally named "1" or "3" in the current
directory.  The "index" lines below still carry the original blob ids.

 nhc                  | 88 +++++++++++++++++++++-----------------------
 scripts/common.nhc   | 15 ++++++++
 scripts/ww_ps.nhc    |  7 +++-
 test/test_common.nhc | 19 +++++++++-
 4 files changed, 81 insertions(+), 48 deletions(-)

diff --git a/nhc b/nhc
index d4296b1..562290c 100755
--- a/nhc
+++ b/nhc
@@ -38,10 +38,10 @@ function die() {
     if [[ -n "$NHC_DETACHED" ]]; then
         echo "$RET $*" > $RESULTFILE
     elif [[ "$NHC_RM" == "sge" ]]; then
-        echo "begin"
-        echo "$HOSTNAME:healthy:false"
-        echo "$HOSTNAME:diagnosis:NHC: $*"
-        echo "end"
+        echo "begin" >&$NHC_FD_OUT
+        echo "$HOSTNAME:healthy:false" >&$NHC_FD_OUT
+        echo "$HOSTNAME:diagnosis:NHC: $*" >&$NHC_FD_OUT
+        echo "end" >&$NHC_FD_OUT
         return 77
     elif [[ -n "$LOGFILE" ]]; then
         oecho "ERROR: $NAME: Health check failed: $*"
@@ -51,7 +51,7 @@ function die() {
         return 0
     fi
     kill_watchdog
-    [[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
+    [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
     exit $RET
 }
 
@@ -91,11 +91,7 @@ function oecho() {
 
     if [[ "$SILENT" == "0" ]]; then
         [[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
-        if [[ -n "$LOGFILE" ]]; then
-            echo "$PREFIX$@" >&3
-        else
-            echo "$PREFIX$@"
-        fi
+        echo "$PREFIX$@" >&$NHC_FD_OUT
     fi
 }
 
@@ -105,11 +101,7 @@ function eecho() {
 
     if [[ "$SILENT" == "0" ]]; then
         [[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
-        if [[ -n "$LOGFILE" ]]; then
-            echo "$PREFIX$@" >&4
-        else
-            echo "$PREFIX$@"
-        fi
+        echo "$PREFIX$@" >&$NHC_FD_ERR
     fi
 }
 
@@ -119,11 +111,7 @@ function vecho() {
 
     if [[ "$VERBOSE" == "1" ]]; then
         [[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
-        if [[ -n "$LOGFILE" ]]; then
-            echo "$PREFIX$@" >&3
-        else
-            echo "$PREFIX$@"
-        fi
+        echo "$PREFIX$@" >&$NHC_FD_OUT
     fi
 }
 
@@ -171,8 +159,10 @@ function nhcmain_init_env() {
     WATCHDOG_PID=0
     FAIL_CNT=0
     FORCE_SETSID=0
+    NHC_FD_OUT=1
+    NHC_FD_ERR=2
     export PATH SYSCONFIGDIR LIBEXECDIR HOSTNAME HOSTNAME_S RET LOGGER_TEXT
-    export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID
+    export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID NHC_FD_OUT NHC_FD_ERR
 
     # Users may override this in /etc/sysconfig/nhc.
     NAME=${0/#*\/}
@@ -294,12 +284,15 @@ function nhcmain_finalize_env() {
     DETACHED_MODE=${DETACHED_MODE:-0}
     DETACHED_MODE_FAIL_NODATA=${DETACHED_MODE_FAIL_NODATA:-0}
     TIMEOUT=${TIMEOUT:-10}
-    MAX_SYS_UID=${MAX_SYS_UID:-99}
     NHC_CHECK_ALL=${NHC_CHECK_ALL:-0}
     NHC_CHECK_FORKED=${NHC_CHECK_FORKED:-0}
     FORCE_SETSID=${FORCE_SETSID:-0}
     export NHC_SID=0
 
+    # Set from system defaults if present.
+    [[ -z "$MAX_SYS_UID" ]] && nhc_common_get_max_sys_uid
+    MAX_SYS_UID=${MAX_SYS_UID:-99}
+
     # Check for session leader.
     kill -s 0 -- -$NHC_PID >/dev/null 2>&1
     if [[ $? -eq 0 ]]; then
@@ -369,26 +362,27 @@ function nhcmain_find_rm() {
     if [[ -d /var/spool/torque ]]; then
         NHC_RM="pbs"
         return 0
+    elif [[ -n "$SGE_ROOT" && -x "$SGE_ROOT/util/arch" ]]; then
+        # SGE binaries typically won't be on the path defined above in the
+        # load sensor environment, but SGE_ROOT will be there.
+        NHC_RM="sge"
+    fi
+
+    # Search PATH for commands
+    if type -a -p -f -P pbsnodes >&/dev/null ; then
+        NHC_RM="pbs"
+        return 0
+    elif type -a -p -f -P scontrol >&/dev/null ; then
+        NHC_RM="slurm"
+        return 0
+    elif type -a -p -f -P badmin >&/dev/null ; then
+        NHC_RM="lsf"
+        return 0
+    elif type -a -p -f -P qselect >&/dev/null ; then
+        NHC_RM="sge"
+        return 0
     fi
 
-    IFS=':'
-    DIRLIST=( $PATH )
-    IFS=$' \t\n'
-    for DIR in "${DIRLIST[@]}" ; do
-        if [[ -x "$DIR/pbsnodes" ]]; then
-            NHC_RM="pbs"
-            return 0
-        elif [[ -x "$DIR/scontrol" ]]; then
-            NHC_RM="slurm"
-            return 0
-        elif [[ -x "$DIR/badmin" ]]; then
-            NHC_RM="lsf"
-            return 0
-        elif [[ -x "$DIR/qselect" ]]; then
-            NHC_RM="sge"
-            return 0
-        fi
-    done
     if [[ -z "$NHC_RM" ]]; then
         log "Unable to detect resource manager."
         return 1
@@ -407,6 +401,8 @@ function nhcmain_redirect_output() {
             exit 1
         else
             dbg "Output redirected per LOGFILE variable $LOGFILE"
+            NHC_FD_OUT=3
+            NHC_FD_ERR=4
         fi
     fi
 }
@@ -506,7 +502,7 @@ function nhcmain_detach() {
     nhcmain_redirect_output
     ELAPSED=$((SECONDS-NHC_START_TS))
     vlog "Node Health Check detached parent completed successfully (${ELAPSED}s)."
-    [[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
+    [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
     exit 0
 }
 
@@ -565,14 +561,14 @@ function nhcmain_finish() {
     ELAPSED=$((SECONDS-NHC_START_TS))
     vlog "Node Health Check completed successfully (${ELAPSED}s)."
     if [[ "$NHC_RM" == "sge" ]]; then
-        echo "begin"
-        echo "$HOSTNAME:healthy:true"
-        echo "$HOSTNAME:diagnosis:HEALTHY"
-        echo "end"
+        echo "begin" >&$NHC_FD_OUT
+        echo "$HOSTNAME:healthy:true" >&$NHC_FD_OUT
+        echo "$HOSTNAME:diagnosis:HEALTHY" >&$NHC_FD_OUT
+        echo "end" >&$NHC_FD_OUT
         return 0
     fi
     kill_watchdog
-    [[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
+    [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
     exit 0
 }
 
diff --git a/scripts/common.nhc b/scripts/common.nhc
index cd5596d..5989169 100644
--- a/scripts/common.nhc
+++ b/scripts/common.nhc
@@ -7,6 +7,7 @@
 #
 
 PASSWD_DATA_SRC="${PASSWD_DATA_SRC:-/etc/passwd}"
+LOGIN_DEFS_SRC="${LOGIN_DEFS_SRC:-/etc/login.defs}"
 RANGE_MATCH_REGEXP1='^[-a-zA-Z0-9_]+[0-9]+[-\.a-zA-Z0-9]*$'
 RANGE_MATCH_REGEXP2='^([-a-zA-Z0-9_]+)\[([0-9]+)\-([0-9]+)\]([-\.a-zA-Z0-9]*)$'
 
@@ -538,3 +539,17 @@ function nhc_cmd_with_timeout() {
     #exec 2>&3 3>&-
     return $RET
 }
+
+# Find system definition for UID range
+function nhc_common_get_max_sys_uid() {
+    local LINE
+
+    if [[ -e "$LOGIN_DEFS_SRC" ]]; then
+        while read LINE ; do
+            if [[ "${LINE##UID_MIN}" != "$LINE" ]]; then
+                MAX_SYS_UID="${LINE//[^0-9]}"
+                break
+            fi
+        done < "$LOGIN_DEFS_SRC"
+    fi
+}
diff --git a/scripts/ww_ps.nhc b/scripts/ww_ps.nhc
index 6d185e2..7ef6da3 100644
--- a/scripts/ww_ps.nhc
+++ b/scripts/ww_ps.nhc
@@ -38,7 +38,12 @@ function nhc_ps_gather_data() {
     elif [[ "$NHC_RM" == "slurm" ]]; then
         RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bslurmstepd\b/}"
     elif [[ "$NHC_RM" == "sge" ]]; then
-        RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bsge_execd\b/}"
+        # If you limit this to execd, you lose when it's been restarted,
+        # and the shepherd is detached. Even if execd is safe because of
+        # system uids, it can spawn mail commands as the job owner, at
+        # least. (The shepherd process name is normally
+        # sge_shepherd-, but maybe not if you change shepherd_cmd.)
+        RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bsge_(execd|shepherd)\b/}"
     else
         dbg "Unsupported RM detected in ${FUNCNAME}(): \"$NHC_RM\""
     fi
diff --git a/test/test_common.nhc b/test/test_common.nhc
index 8485b32..98f0ae9 100644
--- a/test/test_common.nhc
+++ b/test/test_common.nhc
@@ -3,7 +3,7 @@
 # $Id$
 #
 
-plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
+plan $((11+5+8+5+4+6+8+7+9+7)) "common.nhc" && {
     is "`type -t mcheck_regexp 2>&1`" 'function' 'mcheck_regexp() loaded properly'
     is "`type -t mcheck_range 2>&1`" 'function' 'mcheck_regexp() loaded properly'
     is "`type -t mcheck_glob 2>&1`" 'function' 'mcheck_glob() loaded properly'
@@ -13,6 +13,8 @@ plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
     is "`type -t nhc_common_get_uid 2>&1`" 'function' 'nhc_common_get_uid() loaded properly'
     is "`type -t nhc_common_parse_size 2>&1`" 'function' 'nhc_common_parse_size() loaded properly'
     is "`type -t nhc_common_unparse_size 2>&1`" 'function' 'nhc_common_unparse_size() loaded properly'
+    is "`type -t nhc_common_get_unix_time 2>&1`" 'function' 'nhc_common_get_unix_time() loaded properly'
+    is "`type -t nhc_common_get_max_sys_uid 2>&1`" 'function' 'nhc_common_get_max_sys_uid() loaded properly'
 
     mcheck "This is a test." '/test/'
     is $? 0 "Basic regexp match via mcheck()"
@@ -156,4 +158,19 @@ plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
     nhc_common_unparse_count $OSIZE NSIZE
     is "$NSIZE" 999 "nhc_common_unparse_count(): $OSIZE -> 999"
 
+    LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t\t\t 500") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 500 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 500"
+    LOGIN_DEFS_SRC=<(echo -e "UID_MIN 999") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 999 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 999"
+    LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t0\t") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 0"
+    LOGIN_DEFS_SRC=<(echo -e "GID_MIN\t\t\t 1234") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): Bad syntax"
+    LOGIN_DEFS_SRC=<(echo -e "2345") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): Ignore plain number"
+    LOGIN_DEFS_SRC=<(echo -e "stuff\nGID_MIN 1\nGID_MAX 4\nUID_MIN 3\nUID_MAX 7\nblah blah blah\n") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 3 "nhc_common_get_max_sys_uid(): Multiline input"
+    LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t\t\t 500") nhc_common_get_max_sys_uid
+    is "$MAX_SYS_UID" 500 "nhc_common_get_max_sys_uid(): Reset default"
+
 } ; unplan