-
Notifications
You must be signed in to change notification settings - Fork 3
/
fabtrun
executable file
·501 lines (434 loc) · 11.4 KB
/
fabtrun
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
#!/bin/sh
#
# Notes about making this script, `fabtrun`, multi-node are labeled `MN`.
#
# MN: The general idea is that this script should have two operating
# modes, call them "server" mode and "client" mode. Where the existing
# script starts a fabtget process and a fabtput process in each step,
# the script in server mode would start one process in each step (e.g.,
# fabtget). A counterpart script in client mode would start the other
# process in each step (e.g., fabtput). Then server and client will
# wait for the programs they started to finish or fail before moving on
# to the next step.
#
# MN: Right now fabtrun creates a temporary directory when it starts.
# It uses that temporary directory to save some state variables.
# The client and server should share that directory. The script that
# kicks off the multi-node test probably should create the directory and tell
# each client & server instance what the directory is called using an
# environment variable or command-line parameter.
#
# MN: Currently, fabtrun runs every test step in two test
# phases (one phase for get, one phase for put). Then it produces
# a report. In one flexible approach to multi-node testing, each
# client-mode/server-mode instance will run only a single step; then the
# script that runs the batch job will synchronize through filesystem
# fiddling or network messages; then the batch script will start each
# instance on the next step. In this way all steps, 1..n, are run.
# Finally, the batch script can run step n+1, the reporting
# step, on a solitary node, and capture the output.
#
# MN: To test multiple clients per server, fabtrun needs to be modified
# to pass `-k` and `-n` to `fabtput` and `-n` to `fabtget`.
# It may make the most sense to do that after making the other changes.
#
# See other `MN` markings, below, for more advice.
#
# Abort on the first unhandled command failure and on any expansion of
# an unset variable.
set -e
set -u
# Running totals: incremented by print_flagset_line, printed by
# print_footer.
ntests=0
npass=0
nfail=0
# Script name, used in diagnostics and as the prefix of the temporary
# directory.  Parameter expansion replaces an unquoted `basename $0`,
# which split a script path containing whitespace into several words.
prog=${0##*/}
# Becomes `yes` once usr1_handler has run.
timed_out=no
# Duration handed to timeout(1) for the whole run when FABTSUITE_TIMEOUT
# is not set.
timeout_default=600
# Duration handed to timeout(1) for the `cancel` parameter set when
# FABTSUITE_CANCEL_TIMEOUT is not set.
cancel_timeout_default=2
# Set to anything else to emit extra information
verbose=no
# Print all arguments as one line on standard error, then terminate the
# script with exit status 1.
#
# Arguments: the words of the message; they are joined with the first
#            character of IFS (a space unless IFS has been changed).
bail()
{
    # printf with a fixed format prints the message verbatim; echo would
    # treat a leading word such as `-n` as an option and drop it.
    printf '%s\n' "$*" 1>&2
    exit 1
}
# SIGCHLD handler: re-installs itself and, when `verbose` is anything
# other than `no`, notes on standard error that the signal was ignored.
#
# Globals: verbose (read), prog (read)
child_handler()
{
    trap child_handler CHLD
    # Quoted so that an empty or multi-word `verbose` still counts as
    # "anything other than no" instead of making `[` fail with a syntax
    # error (which silently suppressed the message).
    if [ "${verbose}" != "no" ]; then
        echo "$prog ignored SIGCHLD." 1>&2
    fi
}
# SIGALRM handler: re-installs itself, then reports on standard error
# that the signal was ignored.
#
# Globals: prog (read)
alarm_handler()
{
    trap alarm_handler ALRM
    printf '%s ignored SIGALRM\n' "$prog" >&2
}
# SIGUSR1 handler: re-installs itself, announces the timeout on standard
# error and raises the `timed_out` flag that the test loops poll.
#
# Globals: prog (read), timed_out (written)
usr1_handler()
{
    trap usr1_handler USR1
    printf '%s timed out, canceling tests.\n' "$prog" >&2
    timed_out=yes
}
# Cleanup handler installed for EXIT, HUP, INT, PIPE and QUIT: resets
# those traps to their defaults, then removes the temporary directory
# if one was recorded in `tmpdir`.
#
# Globals: tmpdir (read; may be unset or empty, in which case nothing
#          is removed)
exit_handler()
{
    trap - EXIT HUP INT PIPE QUIT
    # Both expansions are quoted: unquoted, a path containing whitespace
    # made `[` fail (so nothing was cleaned up) and would have been
    # split into several unrelated paths for `rm -rf`.
    if [ "${tmpdir:-none}" != none ]; then
        rm -rf -- "$tmpdir"
    fi
}
# Write the one-line usage synopsis to standard error and exit with
# status 1.  Any arguments are ignored.
#
# Globals: prog (read)
usage()
{
    printf 'usage: %s\n' "${prog}" >&2
    exit 1
}
# Fault injection: decide whether a run that succeeded should be reported
# as failed anyway.
#
# Globals: FABTSUITE_RANDOM_FAIL (read) - unset, empty or `no` disables
#          injection.
# Returns: 0 ("inject a failure") for roughly half of the calls when
#          injection is enabled; 1 otherwise.
random_fail()
{
    if [ "${FABTSUITE_RANDOM_FAIL:-no}" = no ]; then
        return 1
    fi
    # Read one unsigned 32-bit number.  /dev/urandom never blocks, which
    # is all a coin flip needs.
    r=$(dd if=/dev/urandom bs=4 count=1 2> /dev/null | od -A n -t u4)
    # 2147483648 is 2^31, the midpoint of the 32-bit range.  It is
    # spelled out because the `**` operator is not part of POSIX shell
    # arithmetic and is rejected by shells such as dash.  $r stays
    # unquoted on purpose so the padding od emits is stripped; an empty
    # read counts as 0, i.e. "do not fail".
    if [ ${r:-0} -ge 2147483648 ]; then
        return 0
    else
        return 1
    fi
}
# Print one row of the report table and update the pass/fail totals.
#
# Arguments:
#   $1 - parameter set, a comma-separated list of flags
#   $2 - recorded result, `ok` or `fail`; a missing or empty value is
#        treated as `fail`
# Input: standard input is expected to hold the three lines written by
#   the `time -p` invocations elsewhere in this script; only the first
#   line, `real <seconds>`, is used.
# Globals: ntests, npass, nfail (updated); default_realtime (written for
#   the `default` parameter set, read for every other one)
# Returns: 1 if no `real` line could be read (the row is then printed as
#   a failure without a duration); otherwise the status of the final
#   printf.
print_flagset_line()
{
    flagset=$1
    # Defaulting here keeps `set -u` from aborting the whole report when
    # the caller's result file was empty and expanded to no argument.
    result=${2:-fail}
    # At end-of-file `read` fails but still clears its variables; the
    # missing `real` keyword is detected below.
    read -r kw_realtime realtime || true
    read -r discard || true
    read -r discard || true
    ntests=$(($ntests + 1))
    # Table label: the flags separated by spaces instead of commas.
    label=$(echo $flagset | sed 's/,/ /g')
    if [ "x${kw_realtime:-none}" != xreal ]; then
        nfail=$(($nfail + 1))
        printf "%-31.31s %8s %-24s %s\\n" "$label" - - fail
        return 1
    elif [ "$result" = ok ]; then
        npass=$(($npass + 1))
    else
        nfail=$(($nfail + 1))
    fi
    if [ "${flagset}" = "default" ]; then
        # The `default` row supplies the baseline for the percentage
        # column of the rows that follow it.
        default_realtime=${realtime}
        printf "%-31.31s %8.2f %-24s %s\\n" "default" \
            "$realtime" - "$result"
    elif [ "$(dc -e "[[t] p q] st $default_realtime 0 =t [f] p q")" = "t" ]
    then
        # dc printed `t`: the baseline equals 0, so no percentage can be
        # computed.
        printf "%-31.31s %8.2f %-24s %s\\n" \
            "$label" "$realtime" - "$result"
    else
        # dc, at precision 2, computes realtime / default_realtime * 100.
        printf "%-31.31s %8.2f %-24.0f %s\\n" \
            "$label" "$realtime" \
            "$(dc -e "2 k $realtime $default_realtime / 100 * p")" \
            "$result"
    fi
}
# Print the environment assignments (in the form accepted by env(1))
# that a parameter set requires for the program under test.
#
# Arguments: $1 - parameter set, a comma-separated list of flags
# Outputs:   one line on standard output; empty when no flag needs an
#            environment setting
env_for_flagset()
{
    flagset=$1
    settings=
    for name in $(echo $flagset | tr ',' ' '); do
        case $name in
        cacheless)
            settings="FI_MR_CACHE_MAX_SIZE=0 ${settings}"
            ;;
        *)
            # Every other flag is expressed through command-line
            # options only.
            ;;
        esac
    done
    # Unquoted on purpose: word splitting drops the trailing blank.
    echo $settings
}
# Print the command line for the counterpart program of a step, i.e. the
# side that is not being measured.  Only the `cancel` flag changes it;
# every other flag leaves the command as given.
#
# Arguments: $1  - parameter set, a comma-separated list of flags
#            $2+ - the base command and its arguments
# Globals:   FABTSUITE_CANCEL_TIMEOUT, cancel_timeout_default (read only
#            when `cancel` is present)
# Outputs:   the resulting command line on standard output
counterpart_cmd_for_flagset()
{
    flagset=$1
    shift
    cmd="$*"
    for flag in $(echo $flagset | tr ',' ' '); do
        if [ "$flag" = cancel ]; then
            # Run under timeout(1), which delivers SIGINT after the
            # configured delay, and pass -c to the program itself.
            cmd="timeout --preserve-status -s INT ${FABTSUITE_CANCEL_TIMEOUT:-$cancel_timeout_default} $cmd -c"
        fi
    done
    echo $cmd
}
# Print the command line for the measured program of a step, with the
# options that each flag of the parameter set calls for.
#
# Arguments: $1  - parameter set, a comma-separated list of flags
#            $2+ - the base command and its arguments
# Globals:   FABTSUITE_CANCEL_TIMEOUT, cancel_timeout_default (read only
#            when `cancel` is present)
# Outputs:   the resulting command line on standard output
cmd_for_flagset()
{
    flagset=$1
    shift
    cmd="$*"
    for flag in $(echo $flagset | tr ',' ' '); do
        option=
        case $flag in
        cancel)
            # Wrap in timeout(1) so SIGINT arrives after the configured
            # delay; the program itself also gets -c.
            cmd="timeout --preserve-status -s INT ${FABTSUITE_CANCEL_TIMEOUT:-$cancel_timeout_default} $cmd"
            option=-c
            ;;
        contiguous)
            option=-g
            ;;
        reregister)
            option=-r
            ;;
        wait)
            option=-w
            ;;
        *)
            # `default`, `cacheless` and unknown flags add no option.
            ;;
        esac
        cmd="$cmd${option:+ $option}"
    done
    echo $cmd
}
# Print the legend explaining the report columns and parameter names,
# followed by the overall test totals.
#
# Globals: ntests, npass, nfail (read);
#          FABTSUITE_CANCEL_TIMEOUT, cancel_timeout_default (read) - the
#          legend quotes the delay actually handed to timeout(1) for the
#          `cancel` parameter set rather than a hard-coded number, which
#          did not match the configured default.
print_footer()
{
    cat<<FOOTER_EOF
key:
parameters:
default: register each RDMA buffer once, use scatter-gather RDMA
cancel: -c, send SIGINT to cancel after ${FABTSUITE_CANCEL_TIMEOUT:-$cancel_timeout_default} seconds
cacheless: env FI_MR_CACHE_MAX_SIZE=0, disable memory-registration cache
contiguous: -g, RDMA conti(g)uous bytes, no scatter-gather
reregister: -r, deregister/(r)eregister each RDMA buffer before reuse
wait: -w, wait for I/O using epoll_pwait(2) instead of fi_poll(3)
duration: elapsed real time in seconds
duration/default: elapsed real time as a percentage of the duration
measured with the default parameter set
${ntests} tests, ${npass} succeeded, ${nfail} failed
FOOTER_EOF
}
# Print the report table for one phase: a heading, then one row per
# parameter set of that phase, produced by print_flagset_line.
#
# Arguments: $1 - phase name, `get` or `put`; selects the list held in
#                 ${1}_flagset and the ${1}-phase-${1}.* files in $tmpdir
# Globals:   tmpdir (read), default_realtime (reset to 0 before the rows
#            are printed)
print_report()
{
    which=$1
    default_realtime=0
    cat<<HEADING_EOF
${which} parameter set duration (s) duration/default (%) result
--------------------------------------------------------------------------
HEADING_EOF
    # The `default` parameter set has to come first in the list: its row
    # establishes the baseline that later rows are compared against.
    for flagset in $(eval echo \$${which}_flagset); do
        result_file=$tmpdir/${which}-phase-${which}.${flagset}.result
        timing_file=$tmpdir/${which}-phase-${which}.${flagset}.timing
        # Without a timing file the row is fed empty input, which
        # print_flagset_line reports as a failure.
        if [ -e $timing_file ]; then
            timing_input=$timing_file
        else
            timing_input=/dev/null
        fi
        # A missing result file counts as `fail`.  The `|| continue`
        # keeps a failed row from aborting the script under `set -e`.
        print_flagset_line $flagset \
            $(cat $result_file 2> /dev/null || echo fail) \
            < $timing_input || continue
    done
}
# First invocation only: run this script again under timeout(1), which
# sends SIGUSR1 once FABTSUITE_TIMEOUT (or the built-in default) has
# elapsed, and pass that run's exit status on.  FABTSUITE_TIMEOUT_SET
# marks the inner run so that it does not wrap itself again.
case ${FABTSUITE_TIMEOUT_SET:-no} in
no)
    FABTSUITE_TIMEOUT_SET=yes timeout -s USR1 \
        ${FABTSUITE_TIMEOUT:-$timeout_default} sh "$0" "$@"
    exit $?
    ;;
esac
# Install the signal and exit handlers before any state is created.
trap alarm_handler ALRM
trap usr1_handler USR1
trap exit_handler EXIT HUP INT PIPE QUIT
trap child_handler CHLD
# MN: this is where the temporary directory that stores the script
# state is created.
tmpdir=$(mktemp -d ${prog}.XXXXXX) || {
    echo "could not create temporary directory, bailing." 1>&2
    exit 1
}
# The script accepts no command-line arguments.
[ $# -eq 0 ] || usage $0
# Parameter sets exercised in both phases.  `default` stays first because
# print_report uses its duration as the baseline for the other rows.
generic_flagset="default cancel cacheless reregister cacheless,reregister wait"
get_flagset=${generic_flagset}
# The put phase additionally covers the `contiguous` combinations.
put_flagset="${generic_flagset} contiguous contiguous,reregister contiguous,reregister,cacheless"
#
# MN: This is where fabtrun loops over every test step in the `get`
# phase. One quick & easy way to make the script run a solitary step,
# `n`, is to just loop over all steps in both phases, counting up on a
# variable `i` in each loop. If `i == n` or if `n` was not provided,
# then run the step; if `i != n`, then `continue`. This preserves the
# logic and code of the single-node script. Just an idea.
#
# Get phase: for each parameter set, run fabtget (timed, with the options
# and environment under test) against a fabtput counterpart, and record
# get-phase-*.timing, .result and .pid files in $tmpdir for print_report.
for flagset in $get_flagset; do
genv=$(env_for_flagset $flagset)
gcmd=$(cmd_for_flagset $flagset "fabtget -a $tmpdir/addr")
# Remove any address file left over from the previous step so that the
# wait loop below only reacts to one created during this step.
rm -f $tmpdir/addr
# Start no further steps once usr1_handler has flagged the timeout.
if [ ${timed_out:-no} = yes ]; then
break
fi
#
# MN: start the `get` process. Only do this in server mode.
#
# This whole group runs in the background; its standard output, a single
# `ok` or `fail`, becomes the .result file.
{
# The report from `time -p` (and anything the command writes to standard
# output) lands in the .timing file, while the command's own standard
# error is routed through descriptor 3 back to this script's standard
# error.
{ env $genv time -p /bin/sh -c "$gcmd 2>&3" 2>&1 ; } \
> $tmpdir/get-phase-get.${flagset}.timing 3>&2 &
pid=$!
# Record the PID of the background job so that the other side can kill it.
echo $pid > $tmpdir/get-phase-get.${flagset}.pid
# The step passes only if the command succeeded, no failure was injected
# by random_fail, and the timeout has not fired.
if wait $pid && ! random_fail && [ ${timed_out:-no} = no ]
then
echo ok
else
# Best effort: take the counterpart down as well.  Its pid file may not
# exist (yet), hence the `|| true`.
xargs kill -9 < $tmpdir/get-phase-put.${flagset}.pid \
2> /dev/null || true
echo fail
fi
} > $tmpdir/get-phase-get.${flagset}.result &
wgpid=$!
pcmd=$(counterpart_cmd_for_flagset $flagset fabtput)
#
# MN: wait for the `get` process to get started. Only do this
# in client mode.
#
# Busy-wait until the address file exists, the timeout fires, or the get
# side has already recorded a failure.  Presumably `fabtget -a` creates
# the address file once it is ready -- confirm against fabtget.
while ! [ -e $tmpdir/addr ] && ! [ ${timed_out:-no} = yes ] && \
! grep -q fail $tmpdir/get-phase-get.${flagset}.result; do
: # spin rudely
done
# On timeout, kill the get job, reap its wrapper, and move on; the loop
# then stops at the check near its top.
if [ ${timed_out:-no} = yes ]; then
xargs kill -9 < $tmpdir/get-phase-get.${flagset}.pid \
2> /dev/null || true
wait $wgpid
continue
fi
# The get side failed before the put side was started: skip the rest of
# this step.
if grep -q fail $tmpdir/get-phase-get.${flagset}.result; then
continue
fi
#
# MN: in client mode, start the `put` process.
#
# Counterpart side: run fabtput with the contents of the address file as
# its argument, untimed, and record its own `ok` or `fail`.
{
$pcmd $(cat $tmpdir/addr) &
pid=$!
echo $pid > $tmpdir/get-phase-put.${flagset}.pid
if wait $pid
then
echo ok
else
# A failed put takes the get job down with it.
xargs kill -9 < $tmpdir/get-phase-get.${flagset}.pid \
2> /dev/null || true
echo fail
fi
} > $tmpdir/get-phase-put.${flagset}.result &
echo "phase get, testing parameter set $flagset" 1>&2
# Wait for both background groups, retrying whenever `wait` returns
# non-zero (presumably when it is interrupted by one of the trapped
# signals -- confirm).
while ! wait; do
echo "re-awaiting background processes for $flagset" 1>&2
done
done
#
# MN: This is where fabtrun loops over every test step in the `put`
# phase.
#
# Put phase: for each parameter set, run fabtput (timed, with the options
# and environment under test) against a fabtget counterpart, and record
# put-phase-*.timing, .result and .pid files in $tmpdir for print_report.
for flagset in $put_flagset; do
    penv=$(env_for_flagset $flagset)
    pcmd=$(cmd_for_flagset $flagset "fabtput")
    gcmd=$(counterpart_cmd_for_flagset $flagset \
        fabtget -a $tmpdir/addr)
    # Remove any address file left over from the previous step so that
    # the wait loop below only reacts to one created during this step.
    rm -f $tmpdir/addr
    # Start no further steps once usr1_handler has flagged the timeout.
    if [ ${timed_out:-no} = yes ]; then
        break
    fi
    #
    # MN: in server mode, start the `get` process
    #
    # Counterpart side: run fabtget untimed in the background and record
    # a single `ok` or `fail` in its .result file.
    {
        $gcmd &
        pid=$!
        echo $pid > $tmpdir/put-phase-get.${flagset}.pid
        if wait $pid
        then
            echo ok
        else
            # Best effort: take the put job down as well.  Its pid file
            # may not exist (yet), hence the `|| true`.
            xargs kill -9 < $tmpdir/put-phase-put.${flagset}.pid \
                2> /dev/null || true
            echo fail
        fi
    } > $tmpdir/put-phase-get.${flagset}.result &
    wgpid=$!
    #
    # MN: in client mode, wait for the `get` process to start.
    #
    # Busy-wait until the address file exists, the timeout fires, or the
    # get side has already recorded a failure.
    while ! [ -e $tmpdir/addr ] && [ ${timed_out:-no} != yes ] && \
        ! grep -q fail $tmpdir/put-phase-get.${flagset}.result; do
        : # spin rudely
    done
    # On timeout, kill the get job, reap its wrapper, and move on.  The
    # diagnostics from xargs/kill are discarded here exactly as for every
    # other kill in this script; this one call used to let them through.
    if [ ${timed_out:-no} = yes ]; then
        xargs kill -9 < $tmpdir/put-phase-get.${flagset}.pid \
            2> /dev/null || true
        wait $wgpid
        continue
    fi
    # The get side failed before the put side was started: skip the rest
    # of this step.
    if grep -q fail $tmpdir/put-phase-get.${flagset}.result; then
        continue
    fi
    #
    # MN: in client mode, start the `put` process
    #
    # Measured side.  The report from `time -p` (and anything the command
    # writes to standard output) lands in the .timing file, while the
    # command's own standard error is routed through descriptor 3 back
    # to this script's standard error.
    {
        { env $penv time -p /bin/sh -c \
            "$pcmd $(cat $tmpdir/addr) 2>&3" 2>&1 ; } \
            > $tmpdir/put-phase-put.${flagset}.timing 3>&2 &
        pid=$!
        echo $pid > $tmpdir/put-phase-put.${flagset}.pid
        # The step passes only if the command succeeded, no failure was
        # injected by random_fail, and the timeout has not fired.
        if wait $pid && ! random_fail && [ ${timed_out:-no} = no ]
        then
            echo ok
        else
            #
            # MN: tricky! If the `put` process fails, you need
            # to stop the `get` process prematurely or else the
            # test may hang until it times out. Then you
            # won't get a complete test report. The client
            # needs to indicate to the batch script and/or
            # the server that the `get` process should be killed
            # off.
            #
            xargs kill -9 < $tmpdir/put-phase-get.${flagset}.pid \
                2> /dev/null || true
            echo fail
        fi
    } > $tmpdir/put-phase-put.${flagset}.result &
    #
    # MN: this synchronization with `put` and `get` processes probably
    # needs to be performed by the batch script, since the exact details
    # may vary based on the platform and batch-submission system.
    #
    echo "phase put, testing parameter set $flagset" 1>&2
    # Wait for both background groups, retrying whenever `wait` returns
    # non-zero (presumably when it is interrupted by one of the trapped
    # signals -- confirm).
    while ! wait; do
        echo "re-awaiting background processes for $flagset" 1>&2
    done
done
#
# MN: Print the report: step `n+1`.
#
# Emit both phase tables, separated by a blank line, then the legend and
# totals.
print_report get
echo
print_report put
print_footer
#
# MN: addendum to the report; the timed-out condition probably should
# be written to the temporary directory or sent somehow over the network
# so that it's globally available.
#
# Exit status: 0 after a complete run, 1 if the timeout cut it short.
case ${timed_out:-no} in
no)
    exit 0
    ;;
esac
echo
echo "*** A timeout occurred before all tests were run. ***"
exit 1