-
Notifications
You must be signed in to change notification settings - Fork 2
/
modified job
82 lines (73 loc) · 2.5 KB
/
modified job
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#BSUB -L /bin/bash
#BSUB -J "BigGAN128-ImageNet"
#BSUB -o "runs/BigGAN128-ImageNet.%J"
#BSUB -e "runs/BigGAN128-ImageNet.%J.err"
#BSUB -n 4
#BSUB -R "span[ptile=1]"
#BSUB -gpu "num=2"
#BSUB -q "normal"
#
# Set up the user environment (Python, WMLCE virtual environment, etc.)
#
# Per-user working area; the job log, the conda install and the training
# rendezvous file all live underneath it.
HOME2="/nobackup/users/$(whoami)"

# Append a start marker to the job log.
touch "${HOME2}/job_stats.txt"
echo "job started" >> "${HOME2}/job_stats.txt"

# Load conda's shell hook, then activate the environment used for training.
CONDA_ROOT="${HOME2}/software/anaconda3"
PYTHON_VIRTUAL_ENVIRONMENT="wmlce-1.6.2"
source "${CONDA_ROOT}/etc/profile.d/conda.sh"
conda activate "${PYTHON_VIRTUAL_ENVIRONMENT}"

export EGO_TOP="/opt/ibm/spectrumcomputing"

# N is the rank count given to mpirun and to --world-size further down;
# TRAINFILE is the path passed to --dist-url as file://... .
N=4
TRAINFILE="${HOME2}/trainfile_biggan128_imagenet"
# Clear out whatever a previous run left at that path.
rm -rf "${TRAINFILE}"
# Write a small helper script into the current directory. The heredoc
# delimiter is quoted, so the body is emitted verbatim and
# OMPI_COMM_WORLD_LOCAL_RANK is only expanded when the helper itself runs.
cat > set.sh << 'EoF_s'
#! /bin/sh
# Set up the GPUs; only one process per node should do this
if [ ${OMPI_COMM_WORLD_LOCAL_RANK} -eq 0 ]; then
ppc64_cpu --smt # Verify the SMT mode
fi
EoF_s
chmod +x set.sh

# Run the helper under mpirun; --tag-output labels the output of each process.
mpirun --tag-output ./set.sh
# Environment for the run: set both values, then export them together so the
# processes started below inherit them.
PAMI_IBV_ADAPTER_AFFINITY=0
NCCL_SOCKET_IFNAME=ib
export PAMI_IBV_ADAPTER_AFFINITY NCCL_SOCKET_IFNAME
# Start background resource monitors for the training run.
# Each monitor samples every 5 seconds and appends to its own timestamped file
# under $HOME. The PID of each one is saved so it can be stopped afterwards.
currentDateTime="$(date +%Y-%m-%d-%H:%M)"
cpu_filename="cpu-training-${currentDateTime}.txt"
mem_filename="mem-training-${currentDateTime}.txt"
gpu_filename="gpu-training-${currentDateTime}.txt"

# CPU utilisation report (sar -u).
touch "${HOME}/${cpu_filename}"
sar -u 5 >> "${HOME}/${cpu_filename}" &
SAR_CPU_PID=$!

# Memory utilisation report (sar -r).
sar -r 5 >> "${HOME}/${mem_filename}" &
SAR_MEM_PID=$!

# GPU name, performance state and GPU/memory utilisation, as CSV.
touch "${HOME}/${gpu_filename}"
gpu_query="gpu_name,pstate,utilization.gpu,utilization.memory"
nvidia-smi --query-gpu="${gpu_query}" --format=csv -l 5 >> "${HOME}/${gpu_filename}" &
SMI_PID=$!
# All three monitors are now running in the background.
# Launch training: mpirun starts N copies of `python main.py`.
# The options are collected in an array so that each stays a separate word;
# they are forwarded to main.py unchanged and in the original order.
# NOTE(review): the meaning of each flag is defined by main.py, not here.
train_args=(
  --model biggan
  --dataset ImageNet --resolution 128
  --shuffle --num_workers 2 --batch_size 256
  --num_G_accumulations 1 --num_D_accumulations 1
  --num_D_steps 2 --G_lr 5e-5 --D_lr 2e-4 --D_B2 0.999 --G_B2 0.999
  --G_attn 64 --D_attn 64
  --G_nl inplace_relu --D_nl inplace_relu
  --SN_eps 1e-6 --BN_eps 1e-5 --adam_eps 1e-6
  --G_ortho 0.0
  --G_shared
  --G_init ortho --D_init ortho
  --hier --dim_z 120 --shared_dim 128
  --G_eval_mode
  --G_ch 96 --D_ch 96
  --ema --use_ema --ema_start 20000
  --test_every 2000 --save_every 500 --num_best_copies 5 --num_save_copies 2 --seed 0
  # Distributed setup: world size N and the file:// URL built from TRAINFILE.
  --multiprocessing-distributed
  --world-size "${N}" --dist-url "file://${TRAINFILE}"
)
mpirun -np "${N}" \
  python "${HOME2}/BigGAN-PyTorch/main.py" "${train_args[@]}"
# Remove the helper script that was generated in the current directory.
/bin/rm -f set.sh

#######################################
# Force-stop one background monitor process.
# Arguments: $1 - PID of the monitor; may be empty if it was never started
# Returns:   0 always; a monitor that has already exited is not an error
#######################################
stop_monitor() {
  local pid=${1:-}
  [ -n "${pid}" ] || return 0
  # The monitor may have exited on its own, so a failed kill is expected and
  # deliberately ignored.
  kill -9 "${pid}" 2>/dev/null || true
}

# Stop the background CPU, memory and GPU monitors.
# The CPU monitor's PID is held in SAR_CPU_PID; there is no SAR_PID variable,
# so killing "$SAR_PID" left the `sar -u` process running.
stop_monitor "${SAR_CPU_PID:-}"
stop_monitor "${SAR_MEM_PID:-}"
stop_monitor "${SMI_PID:-}"