-
Notifications
You must be signed in to change notification settings - Fork 0
/
waitForSlurmJobs_sah.pl
129 lines (98 loc) · 3.13 KB
/
waitForSlurmJobs_sah.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/perl -w
use strict;
# Usage: waitForSlurmJobs.pl <verbose [1 or 0]> <delay in seconds in range 10-3600> [job IDs]
#
#
# Takes as args a list of sbatch job IDs and periodically monitors them. Once they have all left the queue, it exits with status 0
#
# If any of the jobs is seen in a failure state (CANCELLED, FAILED, NODE_FAIL, PREEMPTED or TIMEOUT), the
# program keeps waiting until no watched job is left in the queue, then reports that errors occurred and exits with status 1
# ---------------------------------------------------------------------------
# Command-line handling and main flow.
#   $verbose : a true value enables per-poll progress output
#   $delay   : polling interval in seconds, forced into the range 10..3600
#   @jobIDs  : the job IDs to watch
# Exit status: 0 when no failure state was observed, 1 otherwise.
# ---------------------------------------------------------------------------
my ( $verbose, $delay, @jobIDs ) = @ARGV;

# States that count as successful completion.
my %COMPLETION_STATES = map { $_ => 1 } qw( COMPLETED );

# States that mark the run as having errors.
my %FAILURE_STATES = map { $_ => 1 } qw(
    CANCELLED FAILED NODE_FAIL PREEMPTED TIMEOUT
);

# Validate the delay. It must be a plain number before it is compared, because
# a missing or non-numeric argument would otherwise only produce a warning and
# could still be handed on verbatim to whatever performs the sleep.
if ( !defined $delay || $delay !~ /\A-?\d+(?:\.\d+)?\z/ ) {
    print STDERR "Sleep period is missing or not a number, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ( $delay < 10 ) {
    print STDERR "Sleep period is too short, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ( $delay > 3600 ) {
    print STDERR "Sleep period is too long, will poll queue once every 60 minutes\n";
    $delay = 3600;
}

print " Waiting for " . scalar(@jobIDs) . " jobs: @jobIDs\n";

# Set to 1 by update_pending_jobs() when a watched job is in a failure state.
my $errorsEncountered = 0;

wait_for_all_jobs_to_complete(@jobIDs);

if ($errorsEncountered) {
    print " No more jobs to run - some jobs had errors\n\n";
    exit 1;
}
else {
    print " No more jobs in queue\n\n";
    exit 0;
}
# Block until update_pending_jobs() reports that no watched job remains.
#
# Arguments: the list of job IDs to watch.
# Returns:   nothing meaningful; callers use it only for its blocking effect.
#
# Reads the file-level $verbose (progress output on/off) and $delay (seconds
# between polls).
sub wait_for_all_jobs_to_complete {
    my @pendingJobs = update_pending_jobs(@_);

    while (@pendingJobs) {
        if ($verbose) {
            my $timestamp = `date`;
            chomp $timestamp;

            # The timestamp is passed as an argument rather than interpolated
            # into the format, so a '%' in it cannot be read as a conversion.
            printf " (%s) Still waiting for %d jobs\n\n", $timestamp, scalar(@pendingJobs);
        }

        # Built-in sleep: no shell is started, so $delay is never interpreted
        # as shell text, and ctrl+c still terminates the script while it
        # sleeps. A fractional delay is truncated to whole seconds.
        sleep $delay;

        @pendingJobs = update_pending_jobs(@pendingJobs);
    }

    return;
}
# Poll the queue once for the given jobs and report which are still pending.
#
# Arguments: the list of job IDs to check.
# Returns:   an empty list when no job IDs were given or when
#            query_job_statuses() reported nothing for them; otherwise the
#            input list, unchanged.
#
# Side effects: sets the file-level $errorsEncountered to 1 when any queried
# job is in one of %FAILURE_STATES; prints one line per reported job when
# $verbose is true.
sub update_pending_jobs {
    my (@jobsToQuery) = @_;

    # Nothing to watch: answer immediately instead of querying the scheduler
    # with an empty job list.
    return () unless @jobsToQuery;

    my %jobStatuses = query_job_statuses(@jobsToQuery);

    # No status came back for any of the jobs: no more jobs remain.
    return () unless %jobStatuses;

    if ($verbose) {
        # Sorted so the progress output is in a stable order between polls.
        for my $job ( sort keys %jobStatuses ) {
            print(" Job $job is in state $jobStatuses{$job}\n");
        }
    }

    # A job counts as failed when it was reported and its state is a failure
    # state. The defined() test keeps a status-less entry out of the lookup.
    my @failedJobs = grep {
        defined $jobStatuses{$_} && exists $FAILURE_STATES{ $jobStatuses{$_} }
    } @jobsToQuery;

    $errorsEncountered = 1 if @failedJobs;

    # NOTE(review): code that removed completed, failed and no-longer-listed
    # jobs from the list was present here but commented out. The list is
    # therefore handed back unchanged, and waiting ends only once no status
    # at all is reported for the watched jobs. Confirm this is intended.
    return @jobsToQuery;
}
# Ask squeue for the current state of the given jobs.
#
# Arguments: the list of job IDs to query.
# Returns:   a hash mapping each job ID reported by squeue to its state
#            string. The hash is empty when no job IDs were given, when
#            squeue could not be run, or when squeue did not finish cleanly.
sub query_job_statuses {
    my (@jobsToQuery) = @_;
    my %jobStatuses = ();

    # An empty list would turn into "--jobs=" with no IDs; skip the call.
    return %jobStatuses unless @jobsToQuery;

    my $user = trim(`whoami`);

    # List-form pipe open runs squeue directly, without a shell, so the job
    # IDs (which come from the command line) are passed as data and cannot be
    # interpreted as shell syntax.
    my @command = (
        'squeue',
        '--noheader',
        "--user=$user",
        '--format=%i,%T',
        '--jobs=' . join( ',', @jobsToQuery ),
    );
    open( my $squeue, '-|', @command ) or return %jobStatuses;
    my @outputLines = <$squeue>;

    # close() on a pipe waits for squeue and is true only for a completely
    # clean wait status. Looking at the exit code alone ($? >> 8) would accept
    # output from a squeue that was killed by a signal.
    close($squeue) or return %jobStatuses;

    for my $line (@outputLines) {
        # Split on the LAST comma: the output format is "<job id>,<state>",
        # and the job ID field may itself contain commas (presumably for
        # pending job arrays such as 123_[1,3] - verify against squeue docs).
        next unless trim($line) =~ /\A(.+),([^,]+)\z/;
        $jobStatuses{$1} = $2;
    }

    return %jobStatuses;
}
# Return a copy of the argument with leading and trailing whitespace removed.
# The caller's value is not modified.
sub trim {
    my $text = shift;

    # Alias $_ to the copy so both substitutions act on it directly.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}