-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy patht1011-dynstate-change.t
executable file
·215 lines (180 loc) · 6.21 KB
/
t1011-dynstate-change.t
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/bin/sh
#
# Sharness test script: checks how Fluxion schedules jobs while broker ranks
# are drained and undrained.

test_description='Test Fluxion on Dynamic Resource State Changes'

# Quote the command substitution so a checkout path containing whitespace
# does not split into several words.
. "$(dirname "$0")/sharness.sh"

# Directory handed to the test instance as its config path (see below);
# later tests write config/queues.toml into it.
mkdir -p config

hwloc_basepath=$(readlink -e "${SHARNESS_TEST_SRCDIR}/data/hwloc-data")
# 4 brokers, each of which exclusively has:
# 1 node, 2 sockets, 44 cores (22 per socket), 4 gpus (2 per socket)
excl_4N4B="${hwloc_basepath}/004N/exclusive/04-brokers-sierra2"

# The Fluxion modules are loaded explicitly by the tests below.
export FLUX_SCHED_MODULE=none
test_under_flux 4 full "-o,--config-path=$(pwd)/config"
# Render every jobspec used by this script with `flux run --dry-run` and save
# each one as a JSON file:
#   basic*.json : -N 4 -n 4 -c 44 -g 4
#   1N*.json    : -N 1 -n 1 -c 44 -g 4
#   unsat.json  : like basic.json but -c 45 (the test resources are described
#                 as having 44 cores per node)
# The *.debug.json / *.batch.json variants add --queue=debug / --queue=batch.
test_expect_success 'dyn-state: generate jobspecs' '
    flux run --dry-run -N 4 -n 4 -c 44 -g 4 -t 1h sleep 3600 >basic.json &&
    flux run --dry-run -N 1 -n 1 -c 44 -g 4 -t 1h sleep 3600 >1N.json &&
    flux run --dry-run -N 4 -n 4 -c 45 -g 4 -t 1h sleep 3600 >unsat.json &&
    flux run --dry-run -N 4 -n 4 -c 44 -g 4 -t 1h --queue=debug sleep 3600 >basic.debug.json &&
    flux run --dry-run -N 1 -n 1 -c 44 -g 4 -t 1h --queue=debug sleep 3600 >1N.debug.json &&
    flux run --dry-run -N 4 -n 4 -c 44 -g 4 -t 1h --queue=batch sleep 3600 >basic.batch.json &&
    flux run --dry-run -N 1 -n 1 -c 44 -g 4 -t 1h --queue=batch sleep 3600 >1N.batch.json
'
# Load the 4-node/4-broker test resource description selected by excl_4N4B.
test_expect_success 'load test resources' '
    load_test_resources $excl_4N4B
'

# Bring up the Fluxion resource module (rv1 match format) and then qmanager
# with no extra options.
test_expect_success 'dyn-state: loading fluxion modules works' '
    load_resource match-format=rv1 && load_qmanager
'
# Submit the 4-node job and require a start event within 2 seconds.
test_expect_success 'dyn-state: a full-size job can be scheduled and run' '
    jobid1=$(flux job submit basic.json) &&
    flux job wait-event -t 2 $jobid1 start
'

# Drain rank 1 while the job runs; the job must not post a finish event
# within the 1 second timeout.
test_expect_success 'dyn-state: node drain does not kill the job' '
    flux resource drain 1 &&
    test_must_fail flux job wait-event -t 1 $jobid1 finish
'

# The job can still be cancelled and must reach the clean event.
test_expect_success 'dyn-state: killing the job on the drained node works' '
    flux cancel $jobid1 &&
    flux job wait-event -t 10 $jobid1 clean
'

# Return rank 1 to service for the following tests.
test_expect_success 'dyn-state: undrain' '
    flux resource undrain 1
'
# Check that a node drained while it is running a job is not handed to a new
# job once the running job is gone.
#
# Steps: start a one-node job and read the rank it was allocated from R, submit
# three more one-node jobs (presumably filling the remaining nodes) and wait
# for the last one to start, drain the first job's rank, cancel that job, and
# finally assert that a fifth one-node job does not start within 1 second.
#
# jobid2..jobid5 and rank are left set for the "cancel all jobs" test.
test_expect_success 'dyn-state: the drained node with a job not used' '
    jobid1=$(flux job submit 1N.json) &&
    flux job wait-event -t 10 ${jobid1} start &&
    rank=$(flux job info ${jobid1} R | jq -r ".execution.R_lite[0].rank") &&
    jobid2=$(flux job submit 1N.json) &&
    jobid3=$(flux job submit 1N.json) &&
    jobid4=$(flux job submit 1N.json) &&
    flux job wait-event -t 10 ${jobid4} start &&
    flux resource drain ${rank} &&
    flux cancel ${jobid1} &&
    flux job wait-event -t 10 ${jobid1} clean &&
    jobid5=$(flux job submit 1N.json) &&
    test_must_fail flux job wait-event -t 1 ${jobid5} start
'
# Cancel the four jobs left over from the previous test (jobid2..jobid5),
# wait for each of them to reach the clean event, then undrain the rank that
# the previous test drained.
test_expect_success 'dyn-state: cancel all jobs' '
    flux cancel $jobid2 &&
    flux cancel $jobid3 &&
    flux cancel $jobid4 &&
    flux cancel $jobid5 &&
    flux job wait-event -t 10 $jobid2 clean &&
    flux job wait-event -t 10 $jobid3 clean &&
    flux job wait-event -t 10 $jobid4 clean &&
    flux job wait-event -t 10 $jobid5 clean &&
    flux resource undrain $rank
'
# Submit the 45-cores-per-task jobspec; it must reach the clean event within
# 2 seconds and its eventlog must mention "unsatisfiable".
test_expect_success 'dyn-state: unsatifiability check works' '
    jobid1=$(flux job submit unsat.json) &&
    flux job wait-event -t 2 $jobid1 clean &&
    flux job eventlog $jobid1 | grep unsatisfiable
'
# With rank 0 drained, the 4-node job must not start within 1 second.
# Rank 0 stays drained until the last test of this group.
test_expect_success 'dyn-state: drain prevents a full job from running' '
    flux resource drain 0 &&
    jobid1=$(flux job submit basic.json) &&
    test_must_fail flux job wait-event -t 1 $jobid1 start &&
    flux cancel $jobid1
'

# Queue a 4-node job followed by a one-node job; the one-node job must not
# start within 1 second while the 4-node job is ahead of it.
test_expect_success 'dyn-state: a full job blocks a later job under fcfs' '
    jobid1=$(flux job submit basic.json) &&
    jobid2=$(flux job submit 1N.json) &&
    test_must_fail flux job wait-event -t 1 $jobid2 start &&
    flux cancel $jobid1 &&
    flux cancel $jobid2
'

# The oversized jobspec must still be reported as unsatisfiable while rank 0
# is drained; afterwards rank 0 is undrained.
test_expect_success 'dyn-state: correct unsatifiability after drain' '
    jobid1=$(flux job submit unsat.json) &&
    flux job wait-event -t 2 $jobid1 clean &&
    flux job eventlog $jobid1 | grep unsatisfiable &&
    flux resource undrain 0
'
# Unload qmanager first, then the resource module.
test_expect_success 'dyn-state: removing fluxion modules' '
    remove_qmanager && remove_resource
'

# Reload both modules, this time passing queue-policy=easy to qmanager.
test_expect_success 'dyn-state: loading fluxion modules works' '
    load_resource match-format=rv1 && load_qmanager queue-policy=easy
'
# With rank 3 drained, queue a 4-node job and then a one-node job. The
# one-node job is expected to start even though it was submitted later. After
# it is cancelled and rank 3 is undrained, the 4-node job is expected to start.
test_expect_success 'dyn-state: a full job skipped for a later job under easy' '
    flux resource drain 3 &&
    jobid1=$(flux job submit basic.json) &&
    jobid2=$(flux job submit 1N.json) &&
    flux job wait-event -t 10 $jobid2 start &&
    flux cancel $jobid2 &&
    flux job wait-event -t 10 $jobid2 clean &&
    flux resource undrain 3 &&
    flux job wait-event -t 10 $jobid1 start &&
    flux cancel $jobid1 &&
    flux job wait-event -t 10 $jobid1 clean
'

# Unload qmanager first, then the resource module.
test_expect_success 'dyn-state: removing fluxion modules' '
    remove_qmanager && remove_resource
'
# Write config/queues.toml defining the batch and debug queues (batch is the
# default queue; qmanager policy is easy for batch and fcfs for debug), reload
# the configuration, and start all queues.
#
# The heredoc body and its EOT terminator are kept at column 0 on purpose:
# "<<-" strips leading tabs only, so a space-indented terminator would not
# be recognized.
test_expect_success 'configure queues' '
    cat >config/queues.toml <<-EOT &&
[queues.batch]
[queues.debug]
[policy.jobspec.defaults.system]
queue = "batch"
[sched-fluxion-qmanager]
queue-policy-per-queue = "batch:easy debug:fcfs"
EOT
    flux config reload &&
    flux queue start --all
'

# Load the Fluxion modules again; qmanager gets no command-line options here.
test_expect_success 'dyn-state: loading fluxion modules works' '
    load_resource match-format=rv1 && load_qmanager
'
# In the debug queue (configured with the fcfs policy), a 4-node job that
# cannot run because rank 2 is drained must keep a later one-node job from
# starting.
#
# Both jobs are checked: jobid1 must not start, and jobid2 -- the "later job"
# named in the title -- must not start either. Checking only jobid1 would pass
# regardless of the queue policy.
#
# Rank 2 is not undrained by this test.
test_expect_success 'dyn-state: a full job blocks a later job for fcfs queue' '
    flux resource drain 2 &&
    jobid1=$(flux job submit basic.debug.json) &&
    jobid2=$(flux job submit 1N.debug.json) &&
    test_must_fail flux job wait-event -t 1 ${jobid1} start &&
    test_must_fail flux job wait-event -t 1 ${jobid2} start &&
    flux cancel ${jobid1} &&
    flux cancel ${jobid2} &&
    flux job wait-event -t 1 ${jobid2} clean
'
# In the batch queue (configured with the easy policy), queue a 4-node job and
# then a one-node job; the one-node job must start within 1 second.
# NOTE(review): this appears to rely on rank 2 still being drained from the
# fcfs-queue test, since no undrain is issued in between.
test_expect_success 'dyn-state: a job skipped for a later job for easy queue' '
    jobid1=$(flux job submit basic.batch.json) &&
    jobid2=$(flux job submit 1N.batch.json) &&
    flux job wait-event -t 1 $jobid2 start &&
    flux cancel $jobid1 &&
    flux cancel $jobid2 &&
    flux job wait-event -t 1 $jobid2 clean
'

# Unload qmanager first, then the resource module.
test_expect_success 'dyn-state: removing fluxion modules' '
    remove_qmanager && remove_resource
'
# Delete the queue configuration file and reload the configuration.
test_expect_success 'unconfigure queues' '
    rm -f config/queues.toml && flux config reload
'

# Load the Fluxion modules once more, with no qmanager options.
test_expect_success 'dyn-state: loading fluxion modules works' '
    load_resource match-format=rv1 && load_qmanager
'
# After the queues are unconfigured and the modules reloaded, a 4-node job
# must still not start within 1 second.
# NOTE(review): presumably this holds because rank 2 was drained earlier in
# the script and never undrained -- confirm that is the intended reason.
# The job is left active; the next test cleans it up.
test_expect_success 'dyn-state: a full job cannot run while a node remains drained' '
    jobid1=$(flux job submit basic.json) &&
    test_must_fail flux job wait-event -t 1 ${jobid1} start
'
# Clean up any jobs that are still active.
test_expect_success 'cleanup active jobs' '
    cleanup_active_jobs
'

# Unload qmanager first, then the resource module.
test_expect_success 'dyn-state: removing fluxion modules' '
    remove_qmanager && remove_resource
'

test_done