forked from OleHolmNielsen/Slurm_tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sbadjobs
executable file
·24 lines (19 loc) · 999 Bytes
/
sbadjobs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/env bash
# Print a warning about bad jobs hanging indefinitely in the queue.
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# This script may be run regularly from crontab
# List of Job Reason Codes (see "man squeue") which we look for
REASONLIST="JobHeldAdmin|BadConstraints|InvalidAccount|InvalidQOS|JobLaunchFailure|NodeDown|PartitionDown|PartitionInactive|PartitionNodeLimit|PartitionTimeLimit|ReqNodeNotAvail|WaitingForScheduling|JobHoldMaxRequeue"
# Suppress printing of jobs with non job related reasons:
DONTPRINT="Reserved for maintenance"
# Configure the columns printed by squeue
# export SQUEUE_FORMAT="%.18i %.10P %.8j %.8u %.8a %.10T %.9Q %.10M %.9l %.6D %.6C %m %R"
export SQUEUE_FORMAT2="JobID:8,Partition:11,QOS:7,Name:10 ,UserName:9,Account:9,State:8,PriorityLong:9,ReasonList:16"
if [ -z "$DONTPRINT" ]
then
# echo "$SQUEUE_FORMAT2"
squeue | egrep "$REASONLIST"
else
# echo "$SQUEUE_FORMAT2"
squeue | egrep "$REASONLIST" | egrep -v "$DONTPRINT"
fi