-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_and_time.sh
executable file
·119 lines (105 loc) · 3.39 KB
/
run_and_time.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
#
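# Runs the benchmark and reports the time to convergence.
# Usage (run from the directory that holds config_*.sh and train_imagenet.py,
# since both are referenced by relative path):
#   ./run_and_time.sh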
# Select the per-system configuration file.
# DGXSYSTEM defaults to DGX1. When config_<DGXSYSTEM>.sh is not a regular file
# the DGX1 configuration is loaded instead and a notice is printed; DGXSYSTEM
# itself is left unchanged in that case.
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
if [[ -f "config_${DGXSYSTEM}.sh" ]]; then
  # Quoted so that a system name containing whitespace or glob characters is
  # sourced as the very same single file name that the -f test just checked
  # (unquoted, `source` would receive word-split / globbed arguments).
  source "config_${DGXSYSTEM}.sh"
else
  source config_DGX1.sh
  echo "Unknown system, assuming DGX1"
fi
# start timing
# `start` is the epoch time in seconds (the end-of-run report subtracts it
# from the end time); `start_fmt` is the same moment in readable form.
start="$(date '+%s')"
start_fmt="$(date '+%Y-%m-%d %r')"
printf 'STARTING TIMING RUN AT %s\n' "${start_fmt}"
# run benchmark
#
# Tunable settings. Each variable keeps a value that is already set and
# non-empty in this shell; otherwise it receives the default given here.
: "${SLURM_NTASKS_PER_NODE:=$DGXNGPU}"
: "${OPTIMIZER:=sgd}"
: "${BATCHSIZE:=1664}"
: "${KVSTORE:=device}"
: "${LR:=0.6}"
: "${LRSCHED:=30,60,80}"
: "${WARMUP_EPOCHS:=5}"
: "${LARSETA:=0.001}"
: "${WD:=0.0001}"
: "${LABELSMOOTHING:=0.0}"
: "${SEED:=1}"
: "${EVAL_OFFSET:=2}"
: "${EVAL_PERIOD:=4}"
: "${DALI_PREFETCH_QUEUE:=2}"
: "${DALI_NVJPEG_MEMPADDING:=64}"
: "${DALI_THREADS:=3}"
: "${DALI_CACHE_SIZE:=0}"
: "${DALI_ROI_DECODE:=0}"
: "${NUMEPOCHS:=90}"
: "${NETWORK:=resnet-v1b-fl}"

# Accuracy threshold handed to the training program: the two LARS optimizer
# variants use 0.759, every other optimizer uses 0.749.
case "${OPTIMIZER}" in
  sgdwlars | sgdwfastlars)
    THR="0.759"
    ;;
  *)
    THR="0.749"
    ;;
esac

DATAROOT="/data"
printf '%s\n' "running benchmark"

# One task per GPU: NGPUS mirrors SLURM_NTASKS_PER_NODE and is exported.
NGPUS="${SLURM_NTASKS_PER_NODE}"
export NGPUS

# Comma-separated device indices 0..NGPUS-1 (empty when NGPUS is empty or
# not positive).
GPUS=""
for (( gpu_idx = 0; gpu_idx < NGPUS; gpu_idx++ )); do
  GPUS+="${GPUS:+,}${gpu_idx}"
done
unset gpu_idx
# Arguments for train_imagenet.py, kept in an array so that every value is
# passed on as exactly one word. Element order is significant and unchanged.
PARAMS=()

# Settings driven by the variables configured above (plus the fixed layer
# count that sits between them).
PARAMS+=(
  --gpus "$GPUS"
  --batch-size "$BATCHSIZE"
  --kv-store "$KVSTORE"
  --lr "$LR"
  --lr-step-epochs "$LRSCHED"
  --lars-eta "$LARSETA"
  --label-smoothing "$LABELSMOOTHING"
  --wd "$WD"
  --warmup-epochs "$WARMUP_EPOCHS"
  --eval-period "$EVAL_PERIOD"
  --eval-offset "$EVAL_OFFSET"
  --optimizer "$OPTIMIZER"
  --network "$NETWORK"
  --num-layers 50
  --num-epochs "$NUMEPOCHS"
  --accuracy-threshold "$THR"
  --seed "$SEED"
)

# Fixed settings that this script does not expose as variables.
PARAMS+=(
  --dtype float16
  --use-dali
  --disp-batches 20
  --image-shape 4,224,224
  --fuse-bn-relu 1
  --fuse-bn-add-relu 1
  --min-random-area 0.05
  --max-random-area 1.0
  --conv-algo 1
  --force-tensor-core 1
  --input-layout NHWC
  --conv-layout NHWC
  --batchnorm-layout NHWC
  --pooling-layout NHWC
  --batchnorm-mom 0.9
  --batchnorm-eps 1e-5
)

# Training / validation data files, all located under DATAROOT.
PARAMS+=(
  --data-train "$DATAROOT/train.rec"
  --data-train-idx "$DATAROOT/train.idx"
  --data-val "$DATAROOT/val.rec"
  --data-val-idx "$DATAROOT/val.idx"
)

# DALI options taken from the DALI_* variables.
PARAMS+=(
  --dali-prefetch-queue "$DALI_PREFETCH_QUEUE"
  --dali-nvjpeg-memory-padding "$DALI_NVJPEG_MEMPADDING"
  --dali-threads "$DALI_THREADS"
  --dali-cache-size "$DALI_CACHE_SIZE"
  --dali-roi-decode "$DALI_ROI_DECODE"
)
# Only the horovod kv-store is launched through a per-system wrapper script.
# The wrapper name is built from DGXSYSTEM with everything from the first
# "_multi" onwards removed.
case "${KVSTORE}" in
  horovod)
    : "${DGXSYSTEM:=DGX1}"
    BIND="./ompi_bind_${DGXSYSTEM%%_multi*}.sh"
    ;;
esac

# BIND stays unquoted on purpose: when it is unset or empty it has to expand
# to no words at all so that python is invoked directly.
# shellcheck disable=SC2086
${BIND} python train_imagenet.py "${PARAMS[@]}"
ret_code=$?

# Pause briefly, then propagate a failing training status as this script's
# own exit status (skipping the timing report that follows).
sleep 3
if (( ret_code != 0 )); then
  exit "${ret_code}"
fi
# end timing
end="$(date '+%s')"
end_fmt="$(date '+%Y-%m-%d %r')"
printf 'ENDING TIMING RUN AT %s\n' "${end_fmt}"

# report result
# `result` is the elapsed time in whole seconds between the `start` timestamp
# recorded when timing began and `end`. The RESULT line is comma separated and
# its third field is intentionally empty.
result_name="IMAGE_CLASSIFICATION"
result=$(( ${end} - ${start} ))
printf 'RESULT,%s,,%s,%s,%s\n' \
  "${result_name}" "${result}" "${USER}" "${start_fmt}"