-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathmain.yml
52 lines (47 loc) · 1.84 KB
/
main.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
---
# Installs /usr/bin/htcondor-release-held-jobs: a helper that finds HTCondor
# jobs held with HoldReasonCode 34, raises their RequestMemory and releases
# them, and removes jobs that have already been resubmitted too many times.
- name: "Deploy bash script"
  copy:
    content: |
      #!/bin/bash
      # Release held HTCondor jobs (HoldReasonCode 34) with a larger
      # RequestMemory. Every release is appended to $LOG; that log is also
      # the only record of how often a job has been resubmitted.

      CAP=524288      # 512GB
      RESUBMIT_CAP=2  # only N resubmissions are allowed
      MULTIPLIER=3
      LOG=/data/dnb01/maintenance/condor_rerun_held_jobs.log

      if [ ! -f "$LOG" ]; then
        touch "$LOG"
        echo "Created $LOG"
      fi

      # condor_q prints one row per job, so a cluster id can appear more
      # than once; sort -u makes sure each cluster is handled once per run.
      HELD_JOBS=$(condor_q -hold -autoformat ClusterId HoldReasonCode \
        | awk '$2 == 34 {print $1}' | sort -u)

      for j in $HELD_JOBS
      do
        # Count only this job's own entries. Matching the bare id would
        # also hit dates, memory values and longer ids containing the same
        # digits, and remove jobs that were never resubmitted.
        NUMBER_OF_TIMES_SUBMITTED=$(grep -c -F -- "rerunning held job, id $j," "$LOG")
        # -ge (not -gt): the job is removed once it has been resubmitted
        # RESUBMIT_CAP times, which is what the removal reason reports.
        if [ "${NUMBER_OF_TIMES_SUBMITTED:-0}" -ge "$RESUBMIT_CAP" ]; then
          condor_rm -reason "This job was resubmitted $RESUBMIT_CAP times. Most likely because of running out of memory." "$j"
          continue
        fi

        JOB_DESCRIPTION=$(condor_q "$j" -autoformat JobDescription)
        MEMORY_PROVISIONED=$(condor_q "$j" -autoformat MemoryProvisioned)

        # Anything but a positive integer (empty, "undefined", several
        # lines) would evaluate to 0 or abort the arithmetic below; leave
        # such a job held instead of requesting a bogus amount of memory.
        case "$MEMORY_PROVISIONED" in
          '' | *[!0-9]* | 0)
            echo "Skipping job $j: unusable MemoryProvisioned '$MEMORY_PROVISIONED'" >&2
            continue
            ;;
        esac

        # Grow the request by MULTIPLIER, but never beyond CAP.
        REQUEST_MEMORY=$((MEMORY_PROVISIONED * MULTIPLIER))
        if [ "$REQUEST_MEMORY" -gt "$CAP" ]; then
          REQUEST_MEMORY=$CAP
        fi

        # Short host name: the part after "@" and before the first ".".
        REMOTE_HOST=$(condor_q "$j" -autoformat LastRemoteHost | cut -f2 -d@ | cut -f1 -d.)
        DATE_WITH_TIME=$(date "+%d/%m/%Y-%H:%M:%S")

        # Same line format as before, so existing log entries still count.
        printf '%s, rerunning held job, id %s, description %s, memory_provisioned %s, request_memory %s, %s\n' \
          "$DATE_WITH_TIME" "$j" "$JOB_DESCRIPTION" "$MEMORY_PROVISIONED" \
          "$REQUEST_MEMORY" "$REMOTE_HOST" >> "$LOG"

        condor_qedit "$j" "RequestMemory=$REQUEST_MEMORY"
        condor_release "$j"
      done
    dest: /usr/bin/htcondor-release-held-jobs
    owner: root
    group: root
    # Quoted so the YAML parser cannot reinterpret the octal literal.
    mode: "0755"
# Schedules the held-job release helper every 15 minutes in the crontab of
# the user named by galaxy_user.name.
- name: Add Condor release task to cron
  cron:
    user: "{{ galaxy_user.name }}"
    name: 'Condor release held jobs increasing memory'
    job: '/usr/bin/htcondor-release-held-jobs'
    minute: '*/15'
    hour: '*'
    dow: '*'