forked from OleHolmNielsen/Slurm_tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
notifybadjob
executable file
·382 lines (327 loc) · 11.1 KB
/
notifybadjob
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env bash
# Notify about or Kill a badly behaving job and send information mail to the user.
# Author: Ole Holm Nielsen, [email protected]
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/

### CONFIGURE these lines:
# Mail to the system managers.
# This address is quoted in several of the reason texts and in the mail sent
# to the user, so it must not be empty. A value exported in the environment
# takes precedence over the default given here.
ADMINMAIL=${ADMINMAIL:-root@localhost}
# Mail to the superuser (receives a blind copy of every mail sent)
SUPERUSERMAIL=root@localhost
# Informative web page:
INFOURL=https://your.domain
### End CONFIGURE these lines:

# Command usage:
USAGE="Usage: $0 '[-k]' job-id"

# Name of this cluster (third word of the ClusterName line of the Slurm config)
CLUSTERNAME=$(scontrol show config | awk '/ClusterName/ {print $3}')

# Location of our Slurm tools (see https://github.com/OleHolmNielsen/Slurm_tools/tree/master/)
PSJOB=/usr/local/bin/psjob
PESTAT=/usr/local/bin/pestat

# Mail program
MAIL=/usr/bin/mailx

# Syslog logfile
LOGFILE=/var/log/notifybadjob.log

# Whether to kill job or not
action='Notify about'
killjob=0

# Temporary files.
# Created with mktemp so that the names are unpredictable and the files are
# created exclusively. They are kept directly under /tmp so that the paths
# never contain whitespace, because they are expanded unquoted elsewhere.
MESSAGE=$(mktemp /tmp/jobstatus.XXXXXX) || { echo "ERROR: Could not create temporary file" >&2; exit 1; }
JOBMSGS=$(mktemp /tmp/jobmsgs.XXXXXX) || { rm -f -- "$MESSAGE"; echo "ERROR: Could not create temporary file" >&2; exit 1; }
JOBERRS=$(mktemp /tmp/joberrs.XXXXXX) || { rm -f -- "$MESSAGE" "$JOBMSGS"; echo "ERROR: Could not create temporary file" >&2; exit 1; }

# Remove the temporary files on every exit path (normal end, error exit, signal).
cleanup() {
  rm -f -- "$MESSAGE" "$JOBMSGS" "$JOBERRS"
}
trap cleanup EXIT

# Catch signals: leave with status 2; the EXIT trap above does the cleanup.
# The original list also named signal 19, which is SIGSTOP on most Linux
# platforms and cannot be caught, so it is not listed here.
trap 'exit 2' HUP INT QUIT ALRM TERM
# Process command arguments.
# The only option is -k (kill the job instead of just notifying the user).
while getopts "k" options; do
  case "$options" in
    k)
      killjob=1
      # Let the interactive prompt say "Kill" instead of the "Notify about" default.
      action='Kill'
      ;;
    *)
      echo "$USAGE"
      exit 1
      ;;
  esac
done
# Discard the parsed options in one step. Shifting inside the getopts loop
# would desynchronise OPTIND from the positional parameters, so that a
# repeated option (e.g. "-k -k 123") would end up in the job list.
shift $((OPTIND - 1))

# Get the jobid as the arguments (all remaining words, joined by spaces)
JOBLIST="$*"
if [[ -z "$JOBLIST" ]]; then
  echo "$USAGE"
  exit 1
fi
#
# Inquire about the reason for notifying/killing this job
#
# Catalogue of canned reasons. The two digits in each variable name are the
# number the operator types at the prompt (without a leading zero).
# $ADMINMAIL is expanded here, at assignment time.
reason01="Your job is doing no useful work and is essentially dead."
reason02="Your job has grossly exceeded the available physical RAM memory and is very inefficient."
reason03="Your job has grossly exceeded the physical RAM memory available per CPU core."
reason04="Your job is running too many processes/threads and is overloading the CPU(s)."
reason05="Your job is using more CPU cores than your job has requested."
reason06="Your job is not using all of the CPU cores that you have requested."
reason07="Your job is not laid out correctly for xeon8 multi-CPU nodes - you MUST use the xeon8 partition correctly."
reason08="Your job is not laid out correctly for xeon16 multi-CPU nodes - you MUST use the xeon16 partition correctly."
reason09="Your job is not laid out correctly for xeon24 multi-CPU nodes - you MUST use the xeon24 partition correctly."
reason97="Your job is not laid out correctly for xeon56 multi-CPU nodes - you MUST use the xeon56 partition correctly."
reason98="Your job is not laid out correctly for sm3090 multi-CPU nodes - you MUST use the sm3090 partition correctly."
reason99="Your job is not laid out correctly for xeon40 multi-CPU nodes - you MUST use the xeon40 partition correctly."
reason10="Your job is unfortunately running on a node that has a hardware or electrical error."
reason11="Error in the input file caused job to fail."
reason12="Job failed - please examine the output file."
reason13="SCF convergence problems - please examine the output file."
reason14="Your job seems to be very inefficient with a low CPU utilization."
reason15="Your job is doing too heavy I/O on fileserver or local disk."
reason16="Most likely you did not use gpaw-qsub tool. Your job is not using all of the CPU cores that you have requested."
reason17="Your GPAW job has grossly exceeded the available physical RAM memory and is very inefficient. Please test the memory requirement as described at https://wiki.fysik.dtu.dk/gpaw/documentation/parallel_runs/parallel_runs.html"
reason18="Job failed due to exceeded disk quota - please provide $ADMINMAIL with an estimation of your necessary disk space."
reason19="Your job requests resources that cannot be satisfied or do not exist."
reason20="Please ask $ADMINMAIL for the reason."
reason21="Your MPI job uses 1 Gbit Ethernet in stead of the fast OPA/IB, please correct your MPI command."

# Ask until a valid menu number has been entered.
# ANS is reset to -1 after every invalid answer, which keeps the loop going.
ANS=-1
while test "$ANS" -le 0
do
  echo
  echo "Please select one of the following reasons why you want to $action this job:"
  # Menu order: 97-99 are shown right after 9, next to the other
  # partition-layout reasons.
  for reason_id in 01 02 03 04 05 06 07 08 09 97 98 99 10 11 12 13 14 15 16 17 18 19 20 21
  do
    reason_var="reason${reason_id}"
    # ${reason_id#0} drops the leading zero for display; ${!reason_var} is the text.
    echo " ${reason_id#0}. ${!reason_var}"
  done
  # At end of input read fails. Without this check ANS would be reset to -1
  # and the menu would be printed again forever, so give up instead.
  if ! read -r -p "Please enter reason (no default): " ANS
  then
    echo
    echo "ERROR: No reason was entered (end of input)" >&2
    exit 1
  fi
  # Translate the answer into the name of the matching reasonNN variable.
  case "$ANS" in
    [1-9])
      reason_var="reason0${ANS}"
      ;;
    1[0-9] | 2[01] | 9[7-9])
      reason_var="reason${ANS}"
      ;;
    *)
      reason_var=""
      ANS=-1
      ;;
  esac
  if test -n "$reason_var"
  then
    action_reason=${!reason_var}
  fi
done
echo "Reason: $action_reason"
echo
#
# Big loop over jobs
#
# For every job id in JOBLIST: collect the Slurm job record, node usage,
# process list and working-directory listing into a mail message, optionally
# cancel the job and log the action, then mail the message to the job owner
# with a blind copy to the superuser.

# Remove the temporary files on every exit path, including the "exit 1"
# error exits inside the loop.
trap 'rm -f -- "$MESSAGE" "$JOBMSGS" "$JOBERRS"' EXIT

# Print the value of the first "KEY=value" token whose key is exactly $1 in
# the saved scontrol output. Matching the whole key keeps e.g. ReqNodeList=
# from being mistaken for NodeList=.
job_field() {
  awk -v key="$1=" '{
    for (i = 1; i <= NF; i++)
      if (index($i, key) == 1) { print substr($i, length(key) + 1); exit }
  }' "$JOBMSGS"
}

# JOBLIST is split on whitespace on purpose: one word per job id.
for jobid in $JOBLIST
do
  #
  # Define strings to be used in the report
  #
  if test "$killjob" -eq 1
  then
    action='Kill'
    action_done='killed'
    action_print="Your $CLUSTERNAME job id $jobid has been ${action_done} by the superuser."
  else
    action='Notify about'
    action_done='investigated'
    action_print="Please contact ${ADMINMAIL}: your $CLUSTERNAME job id $jobid may have to be killed."
  fi

  # Check if this jobid can be inquired successfully.
  if ! JOBSTATE=$(squeue -h -O State -j "$jobid")
  then
    echo "Error inquiring about job $jobid"
    exit 1
  fi

  # Detect job arrays by counting number of words in JOBSTATE
  # (deliberately unquoted: one array element per word).
  words=( $JOBSTATE )
  if (( ${#words[@]} > 1 ))
  then
    echo "ERROR: The job $jobid is a job array with multiple jobs. Please select only one of the array jobs:"
    squeue -O JobArrayID,ArrayJobID,ArrayTaskID,JobID,StartTime,TimeUsed,TimeLimit -j "$jobid"
    exit 1
  fi

  echo -n "Get job information..."
  scontrol --details show job "$jobid" > "$JOBMSGS" 2> "$JOBERRS"
  # Check for errors from job status: anything on stderr counts as a failure
  if test -s "$JOBERRS"
  then
    echo
    cat "$JOBERRS"
    exit 1
  fi

  # Get user E-mail address and name (job arrays handled by uniq)
  USERID=$(squeue -h -O UserName: -j "$jobid" | uniq)
  # Drop any column padding so that the name can be used as an exact key below.
  USERID=${USERID//[[:space:]]/}
  if test -z "$USERID"
  then
    echo "ERROR: Could not get userid of job $jobid"
    exit 1
  fi
  # The mail is addressed to the bare user name.
  # USERMAIL=`grep Job_Owner $JOBMSGS | awk '{print $3}'`
  USERMAIL=$USERID
  # Keyed lookup: exact match on the user name, and it also works where
  # enumerating the whole passwd database is disabled.
  FULLNAME=$(getent passwd "$USERID" | awk -F: '{print $5}')
  # Maybe the user wants to notify this address also
  # NOTIFYMAIL=`grep Mail_Users $JOBMSGS | awk '{print $3}'`
  NOTIFYMAIL=""

  cat <<EOF
Job $jobid belongs to user $USERID, full name is $FULLNAME
User groups: $(groups "$USERID")
Action: ${action} a badly behaving job id $jobid
EOF

  #
  # Initialize message to the user
  #
  cat <<EOF > "$MESSAGE"
*** WARNING ***
${action_print}
Reason: ${action_reason}
If you have any questions about this action, please contact ${ADMINMAIL}.
You may want to consult the Niflheim Wiki page about batch job information:
$INFOURL
In the following we display various pieces of information about your badly behaving batch job.
Slurm batch system information about job id $jobid:
---------------------------------------------------
EOF

  # Copy the job status info from Slurm
  cat "$JOBMSGS" >> "$MESSAGE"

  # Is the job still in a queued state ?
  # JOBSTATE is the first word of the JobState line, including the "JobState=" key.
  JOBSTATE=$(awk '/JobState=/ {print $1}' "$JOBMSGS")
  echo "NOTE: This job has a state of $JOBSTATE"
  if test "$JOBSTATE" = "JobState=PENDING"
  then
    JOBQUEUED=1
  else
    JOBQUEUED=0
  fi

  # Only print job information for running jobs
  if test "$JOBQUEUED" = 0
  then
    # Memory and CPU usage
    cat <<EOF >> "$MESSAGE"
RAM-memory usage and CPU-load usage of your job on the job nodes.
-----------------------------------------------------------------
Please look at these usage numbers to determine why the job was behaving badly
(note especially items marked by *):
node state load pmem ncpu mem resi usrs tasks jobids/users
EOF
    # Print job memory and CPU usage by pestat.
    # $PESTAT is left unquoted so that the configured value may carry options.
    $PESTAT -j "$jobid" >> "$MESSAGE"
    cat <<EOF >> "$MESSAGE"
Explanation of some columns in the usage list:
node: The compute node running your job.
load: The CPU load average (should not exceed the number of physical CPUs).
pmem: Physical memory (MB) in the node.
ncpu: Number of physical CPUs in the node.
resi: Resident memory (MB) in use (should not exceed the physical memory pmem by too much).
EOF
    # Information about the job processes on the nodes
    cat <<EOF >> "$MESSAGE"
Process information on the nodes of your job.
---------------------------------------------
Please look at these processes to determine why the job was behaving badly:
EOF
    # Print job processes by psjob (local command).
    # The file redirection comes first so that stderr follows stdout into the
    # message; written the other way round, stderr would not be captured.
    echo -n "Process status..."
    $PSJOB "$jobid" >> "$MESSAGE" 2>&1
  fi

  # Amount of job wallclock time, from the RunTime= field of the job record.
  # This value is written to the logfile below.
  WALLTIME=$(job_field RunTime)
  if test -z "$WALLTIME"
  then
    WALLTIME="Walltime:unknown"
  fi

  # List of nodes used
  NODELIST=$(job_field NodeList)
  if test -z "$NODELIST"
  then
    NODELIST="Nodes:unknown"
  fi

  # Working directory
  echo -n "Listing files..."
  # NOTE(review): splitting on "=" truncates a directory name that itself
  # contains "=" - kept as in the original.
  WORKDIR=$(grep WorkDir= "$JOBMSGS" | awk -F= '{print $2}')
  if test -z "$WORKDIR"
  then
    WORKDIR="WORKDIR:unknown"
  else
    # Check how many files in workdir.
    # ls -f lists unsorted and includes "." and ".." in the count.
    NUMFILESINDIR=$(ls -f -- "$WORKDIR" | wc -l)
    if (( NUMFILESINDIR < 1000 ))
    then
      # If < 1000 files then list them
      MAXFILES=20
      cat <<EOF >> "$MESSAGE"
Information about files in job working directory
------------------------------------------------
Job $jobid working directory $WORKDIR
This directory contains $NUMFILESINDIR files in total.
Newest files in this directory are (max. $MAXFILES files):
EOF
      ls -lt -- "$WORKDIR" | head -n "$MAXFILES" >> "$MESSAGE"
    else
      # If >= 1000 files then issue a warning
      cat <<EOF >> "$MESSAGE"
Information about files in job working directory
------------------------------------------------
Job $jobid working directory $WORKDIR
This directory contains $NUMFILESINDIR files in total.
NOTICE: This is a very large number of files in a single directory,
and this may possibly cause slow job behavior.
EOF
    fi
  fi
  echo Done.

  #
  # Delete the job
  #
  if [[ "$killjob" == "1" ]]
  then
    echo "Now deleting job $jobid"
    echo "Reason: $action_reason"
    scancel "$jobid"
    #
    # Write statistics to the Syslog logfile
    #
    if test ! -f "$LOGFILE"
    then
      echo "Creating logfile $LOGFILE"
      touch "$LOGFILE"
    fi
    # Time of killing the job
    TIMESTAMP=$(date +"%Y %b %e %T")
    # Append a logfile entry.
    # The fields are deliberately unquoted so that runs of whitespace (such as
    # the space-padded day of month from %e) collapse to single spaces.
    echo "Adding entry to logfile $LOGFILE"
    # shellcheck disable=SC2086
    echo $TIMESTAMP $jobid $USERID $WORKDIR $WALLTIME $NODELIST ${action_reason} >> "$LOGFILE"
  fi

  # Send mail to the user, with a blind copy to the superuser.
  # $MAIL is left unquoted so that the configured value may carry options;
  # $NOTIFYMAIL is left unquoted so that an empty value adds no recipient.
  echo "Sending mail to user=$USERMAIL full name: $FULLNAME"
  # shellcheck disable=SC2086
  (echo Dear $FULLNAME ; cat "$MESSAGE") | $MAIL -s "WARNING: ${action_print}" -b "$SUPERUSERMAIL" "$USERMAIL" $NOTIFYMAIL
# End of loop over jobids
done

# Clean up
rm -f -- "$MESSAGE" "$JOBMSGS" "$JOBERRS"