-
Notifications
You must be signed in to change notification settings - Fork 2
/
node.sh
executable file
·109 lines (84 loc) · 4.2 KB
/
node.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash
#
# Usage: node.sh CONFIGFILE
#
# Starts this node's share of the peers in batches, waits for all of them to
# finish, and copies everything under /local/$USER to the file server with
# rsync. The whole run holds an exclusive flock on /local/lockfile.$USER
# (file descriptor 200).
#
# Read from the environment:
#   PRUN_CPU_RANK      index of this node (stored in RANK)
#   SGE_KEEP_TMPFILES  must be "no"
#   USER, HOME
#
# Used below but never assigned here, so they are expected to come from
# CONFIGFILE (which is copied to the local disk and sourced) or from the
# environment:
#   PYTHONCODEDIR TASKS HOSTS BATCHSIZE BATCHDELAY LOGGERCONF ZEROLOGGERCONF
#   BOOTSTRAPFILE BINARY BINARYPARAMS DISPERSYPARAMS DISPERSYKARGS RESULTDIR
#   RSYNCPARAMS
#
# Exit status:
#   1  CONFIGFILE argument missing or not a regular file
#   2  SGE_KEEP_TMPFILES is not "no"
#   3  a peer working directory already exists
#   4  environment or setup failure (lock, USER, mkdir, cp, cd, bad numbers)
#   5  copying the results with rsync failed

# Print an error line in the script's log format (on stdout, like every other
# message of this script) and exit with the given status.
#   $1 - exit status, remaining arguments - message text
die() {
  local status=$1
  shift
  echo "$(date) ${HOSTNAME} error: $*"
  exit "$status"
}

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Print the first peer number owned by a node.
#   $1 - total number of peers, $2 - number of hosts, $3 - rank of the node
# Peers are spread evenly; the first (tasks % hosts) ranks get one extra peer.
peer_range_low() {
  local tasks=$1 hosts=$2 rank=$3
  local remainder=$(( tasks % hosts ))
  echo $(( tasks / hosts * rank + (remainder < rank ? remainder : rank) ))
}

# Print the last peer number owned by a node (same arguments as
# peer_range_low). This is one less than the first peer of the next rank, so
# it is smaller than the low bound when the node owns no peers at all.
peer_range_high() {
  local tasks=$1 hosts=$2 rank=$3
  echo $(( $(peer_range_low "$tasks" "$hosts" $(( rank + 1 ))) - 1 ))
}

(
  # $HOSTNAME fails, obtain from /etc/HOSTNAME
  HOSTNAME=$(cat /etc/HOSTNAME)

  # Without the lock a second instance could delete the working directory of
  # a running one, so a failure to lock is fatal.
  flock -e 200 || die 4 "cannot lock /local/lockfile.$USER"

  # determine the node index
  RANK=$PRUN_CPU_RANK

  #
  # 1 - parameters
  #
  CONFIG="$1"
  if [[ ! -f "$CONFIG" ]]; then
    die 1 "$0 CONFIGFILE"
  fi

  #
  # 2 - environment
  #
  # An empty USER would turn the cleanup below into 'rm -rf /local/'.
  if [[ -z "${USER:-}" ]]; then
    die 4 "USER is not set"
  fi

  # prepare local working directory
  if [[ -d "/local/$USER" ]]; then
    echo "$(date) ${HOSTNAME} warning: local workdir already exists (removing...)"
    rm -rf "/local/${USER:?}"
  fi
  mkdir "/local/$USER" || die 4 "cannot create local workdir [/local/$USER]"

  LOCALCONFIG="/local/$USER/config"
  cp -r "$CONFIG" "$LOCALCONFIG" || die 4 "cannot copy config [$CONFIG]"
  # The status of 'source' is that of the last command in the config, which
  # says nothing about whether loading worked, so it is not checked.
  # shellcheck source=/dev/null
  source "$LOCALCONFIG"

  if [[ "$SGE_KEEP_TMPFILES" != "no" ]]; then
    echo "$(date) ${HOSTNAME} error: SGE_KEEP_TMPFILES should be 'no'"
    die 2 "this property must be set in .bashrc [${HOME}/.bashrc]"
  fi

  # start of runtime
  STARTSTAMP=$(date +%s)

  #
  # 3 - setup peer
  #
  # copy python branch to the local disk
  LOCALCODEDIR="/local/$USER/localcodedir"
  cp -r "$PYTHONCODEDIR" "$LOCALCODEDIR" \
    || die 4 "cannot copy PYTHONCODEDIR [$PYTHONCODEDIR]"
  export PYTHONPATH="$LOCALCODEDIR:$PYTHONPATH"

  # Validate the numbers that drive the loops: an empty value counts as 0
  # inside (( )), which would start the wrong peers or loop forever.
  is_uint "$RANK" || die 4 "PRUN_CPU_RANK must be a non-negative integer [$RANK]"
  is_uint "$TASKS" || die 4 "TASKS must be a non-negative integer [$TASKS]"
  if ! is_uint "$HOSTS" || (( HOSTS == 0 )); then
    die 4 "HOSTS must be a positive integer [$HOSTS]"
  fi
  if ! is_uint "$BATCHSIZE" || (( BATCHSIZE == 0 )); then
    die 4 "BATCHSIZE must be a positive integer [$BATCHSIZE]"
  fi

  # calculate the PEERNUMBER range that this node is responsible for
  LOWPEERNUMBER=$(peer_range_low "$TASKS" "$HOSTS" "$RANK")
  HIGHPEERNUMBER=$(peer_range_high "$TASKS" "$HOSTS" "$RANK")

  echo "= $(date) ${HOSTNAME} start [$LOWPEERNUMBER:$HIGHPEERNUMBER]"
  for (( BATCHNUMBER = LOWPEERNUMBER; BATCHNUMBER <= HIGHPEERNUMBER; BATCHNUMBER += BATCHSIZE )); do
    for (( PEERNUMBER = BATCHNUMBER; PEERNUMBER < BATCHNUMBER + BATCHSIZE && PEERNUMBER <= HIGHPEERNUMBER; PEERNUMBER++ )); do
      PEER_WORKDIR="/local/${USER}/${HOSTNAME}_${PEERNUMBER}"
      if [[ -d "$PEER_WORKDIR" ]]; then
        die 3 "PEER_WORKDIR already exists [$PEER_WORKDIR]"
      fi
      mkdir "$PEER_WORKDIR" || die 4 "cannot create PEER_WORKDIR [$PEER_WORKDIR]"
      # The peer writes stdout/stderr relative to the current directory, so a
      # failed cd would put them in another peer's directory.
      cd "$PEER_WORKDIR" || die 4 "cannot enter PEER_WORKDIR [$PEER_WORKDIR]"

      if [[ -f "$LOGGERCONF" ]]; then
        cp "$LOGGERCONF" logger.conf
      fi
      # peer 0 gets its own logger configuration when one is provided
      if [[ -f "$ZEROLOGGERCONF" && "$PEERNUMBER" == "0" ]]; then
        cp "$ZEROLOGGERCONF" logger.conf
      fi
      if [[ -f "$BOOTSTRAPFILE" ]]; then
        cp "$BOOTSTRAPFILE" bootstraptribler.txt
      fi

      PEER_KARGS="${DISPERSYKARGS},resultdir=${RESULTDIR},localcodedir=${LOCALCODEDIR},startstamp=${STARTSTAMP},scenario=${LOCALCONFIG},peernumber=${PEERNUMBER},peercount=${TASKS},lowpeernumber=${LOWPEERNUMBER},highpeernumber=${HIGHPEERNUMBER}"

      # Output goes to the stdout and stderr files, and stderr is also echoed
      # to the console. Alternative redirections:
      #   files only:             >stdout 2>stderr &
      #   files and console both: > >(tee stdout) 2> >(tee stderr >&2) &
      #
      # BINARY, BINARYPARAMS and DISPERSYPARAMS are left unquoted on purpose
      # so that they split into separate words.
      # shellcheck disable=SC2086
      $BINARY $BINARYPARAMS "$LOCALCODEDIR/peer.py" $DISPERSYPARAMS \
        --log-identifier "${HOSTNAME}_${PEERNUMBER}" \
        --kargs "$PEER_KARGS" \
        >stdout 2> >(tee stderr >&2) &
    done
    # batch delay (no delay when BATCHDELAY is not configured)
    sleep "${BATCHDELAY:-0}"
  done

  # wait for all processes to finish
  echo "= $(date) ${HOSTNAME} wait [$LOWPEERNUMBER:$HIGHPEERNUMBER]"
  wait

  echo "= $(date) ${HOSTNAME} copy [$LOWPEERNUMBER:$HIGHPEERNUMBER]"
  # remove local branch and scenario
  rm -rf "$LOCALCONFIG" "$LOCALCODEDIR"

  # copy results (note that the '/' behind ${USER} ensures the content of that
  # directory is copied); RSYNCPARAMS is left unquoted so it splits into words
  # shellcheck disable=SC2086
  rsync $RSYNCPARAMS --archive "/local/${USER}/" "${USER}@fs3:${RESULTDIR}" \
    || die 5 "rsync of results failed [${USER}@fs3:${RESULTDIR}]"

  # done
  echo "= $(date) ${HOSTNAME} done [$LOWPEERNUMBER:$HIGHPEERNUMBER]"
) 200>"/local/lockfile.$USER"