From 9433de4bbf50c7f0e59765d6c4c8720af1a637b6 Mon Sep 17 00:00:00 2001 From: FreemanX Date: Tue, 28 Feb 2017 18:26:13 +0800 Subject: [PATCH] Doc for adding new tools updated, network configuration checked --- network-configs/NetworksConfigsDoc.txt | 17 +++------ synthetic/scripts/batch-bencmarks.sh | 1 + synthetic/scripts/batch-cpus-gpu10.sh | 12 +++--- synthetic/scripts/batch-cpus-gpu11.sh | 14 +++---- synthetic/scripts/batch-cpus-gpu12.sh | 14 +++---- synthetic/scripts/batch-cpus.sh | 28 +++++++------- tools/Readme.md | 51 ++++++++++++++++++++++---- tools/common/xxxbm.py | 22 +++++++---- 8 files changed, 98 insertions(+), 61 deletions(-) diff --git a/network-configs/NetworksConfigsDoc.txt b/network-configs/NetworksConfigsDoc.txt index 66598f3..dd85ea1 100644 --- a/network-configs/NetworksConfigsDoc.txt +++ b/network-configs/NetworksConfigsDoc.txt @@ -6,17 +6,10 @@ Number of Parameters: 13707264 = 13MB Experiment configs: Learning Rate: 0.05 -Evaluate Step: 2 ----------------------- Batch Size | Epoch ----------------------- - 32 | 40 - ------------------- - 64 | 40 - ------------------- - 128 | 40 - ------------------- - 256 | 40 + 342 | 40 ------------------- 512 | 40 ------------------- @@ -59,7 +52,7 @@ Evaluate Step: 2 Batch Size | Epoch ----------------------- ------------------- - 32 | 40 + 86 | 40 ------------------- 64 | 40 ------------------- @@ -70,6 +63,8 @@ Evaluate Step: 2 512 | 40 ------------------- 1024 | 40 + ------------------- + 2048 | 40 ----------------------- For multiple GPUs, the batch size are set as follows: ---------------- @@ -120,7 +115,7 @@ Evaluate Step: 2 ----------------------- Batch Size | Epoch ----------------------- - 8 | 40 + 11 | 40 ------------------- 16 | 40 ------------------- @@ -129,8 +124,6 @@ Evaluate Step: 2 64 | 40 ------------------- 128 | 40 - ------------------- - 256 | 40 ----------------------- For multiple GPUs, the batch size are set as follows: ---------------- diff --git a/synthetic/scripts/batch-bencmarks.sh b/synthetic/scripts/batch-bencmarks.sh index 048bd1d..047f10b 100755 --- a/synthetic/scripts/batch-bencmarks.sh +++ b/synthetic/scripts/batch-bencmarks.sh @@ -21,6 +21,7 @@ #minibatch=64 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh # GPU-1 RetNet +sleep 1200 minibatch=8 iterations=8 epochs=4 device_id=1 network_name=resnet ./cnn-benchmarks.sh minibatch=16 iterations=8 epochs=4 device_id=1 network_name=resnet ./cnn-benchmarks.sh minibatch=32 iterations=8 epochs=4 device_id=1 network_name=resnet ./cnn-benchmarks.sh diff --git a/synthetic/scripts/batch-cpus-gpu10.sh b/synthetic/scripts/batch-cpus-gpu10.sh index d94c2d7..0fdcafc 100755 --- a/synthetic/scripts/batch-cpus-gpu10.sh +++ b/synthetic/scripts/batch-cpus-gpu10.sh @@ -1,12 +1,12 @@ # CPU Version -minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=1 ./cnn-benchmarks.sh -minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=8 ./cnn-benchmarks.sh +#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=1 ./cnn-benchmarks.sh +#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=8 ./cnn-benchmarks.sh minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=1 ./cnn-benchmarks.sh minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=8 ./cnn-benchmarks.sh -minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=1 ./fc-benchmarks.sh 
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=8 ./fc-benchmarks.sh

-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
diff --git a/synthetic/scripts/batch-cpus-gpu11.sh b/synthetic/scripts/batch-cpus-gpu11.sh
index b2f4ff9..f30153d 100755
--- a/synthetic/scripts/batch-cpus-gpu11.sh
+++ b/synthetic/scripts/batch-cpus-gpu11.sh
@@ -1,12 +1,12 @@
# CPU Version
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=2 ./cnn-benchmarks.sh
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=16 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=2 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=16 ./cnn-benchmarks.sh
minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=2 ./cnn-benchmarks.sh
minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=16 ./cnn-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=16 ./fc-benchmarks.sh
-
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=16 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=16 ./fc-benchmarks.sh
+#
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=16 ./fc-benchmarks.sh
diff --git a/synthetic/scripts/batch-cpus-gpu12.sh b/synthetic/scripts/batch-cpus-gpu12.sh
index 5765ded..f67ed84 100755
--- a/synthetic/scripts/batch-cpus-gpu12.sh
+++ b/synthetic/scripts/batch-cpus-gpu12.sh
@@ -1,12 +1,12 @@
# CPU Version
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=4 ./cnn-benchmarks.sh
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=32 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=4 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=32 ./cnn-benchmarks.sh
# minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=4 ./cnn-benchmarks.sh
minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=32 ./cnn-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=32 ./fc-benchmarks.sh
-
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=32 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=32 ./fc-benchmarks.sh
+#
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=32 ./fc-benchmarks.sh
diff --git a/synthetic/scripts/batch-cpus.sh b/synthetic/scripts/batch-cpus.sh
index 3c6a3e7..1f09208 100755
--- a/synthetic/scripts/batch-cpus.sh
+++ b/synthetic/scripts/batch-cpus.sh
@@ -1,8 +1,8 @@
# CPU Version
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=1 ./cnn-benchmarks.sh
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=2 ./cnn-benchmarks.sh
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=4 ./cnn-benchmarks.sh
-minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=8 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=1 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=2 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=4 ./cnn-benchmarks.sh
+#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=8 ./cnn-benchmarks.sh
#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=16 ./cnn-benchmarks.sh
#minibatch=16 iterations=3 epochs=2 device_id=-1 network_name=alexnet OMP_NUM_THREADS=32 ./cnn-benchmarks.sh
#
@@ -10,20 +10,20 @@ minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet
minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=2 ./cnn-benchmarks.sh
minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=4 ./cnn-benchmarks.sh
minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=8 ./cnn-benchmarks.sh
-#minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=16 ./cnn-benchmarks.sh
-#minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=32 ./cnn-benchmarks.sh
+minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=16 ./cnn-benchmarks.sh
+minibatch=16 iterations=2 epochs=3 device_id=-1 network_name=resnet OMP_NUM_THREADS=32 ./cnn-benchmarks.sh

-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=16 ./fc-benchmarks.sh
#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752 OMP_NUM_THREADS=32 ./fc-benchmarks.sh
#
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
-minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=1 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=2 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=4 ./fc-benchmarks.sh
+#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=8 ./fc-benchmarks.sh
#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=16 ./fc-benchmarks.sh
#minibatch=64 iterations=8 epochs=4 device_id=-1 network_name=ffn26752l6 OMP_NUM_THREADS=32 ./fc-benchmarks.sh
#
diff --git a/tools/Readme.md b/tools/Readme.md
index 5b992fd..49072ba 100644
--- a/tools/Readme.md
+++ b/tools/Readme.md
@@ -1,12 +1,47 @@
-## How to add new tools: ##
-- 1. Put your tools' scripts directory here and copy the benchmark scrip under common/xxxbm.py (rename it to bm.py) in it.
-- 2. Make sure xxxbm.py will take all those parameters pre-defined, you may ignore some of them as long as it works.
-- 3. xxxbm.py should print out the running result which will be taken by benchmark.py and post to the server, and the format of the result is:
+# How to add new tools #
+## Overview
+- Put your tool's scripts directory here, then copy the benchmark script common/xxxbm.py (renamed to \<tool\>bm.py) and the test script testbm.sh into it.
+- Make sure \<tool\>bm.py accepts all the pre-defined parameters below; you may ignore some of them as long as your scripts work. \<tool\>bm.py serves as an entry point and will be invoked by ../benchmark.py, which cares only about the input parameters and output results of \<tool\>bm.py.
+- All tests in common/testbm.sh must pass before a new tool is put into use.
+- Please create a readme file in your tool's directory covering tool setup, data preparation, dependencies, environment, etc.
+## \<tool\>bm.py explained
+### Input options
+| Input Argument | Details | Default value |
+|:--------------:|:--------------------------------------------------------------------------------------------------:|:------------------------------------------------:|
+| -log | Name of the log file | You may set your own default value for debugging |
+| -batchSize | Batch size for each GPU. If you are using n GPUs for a test, n*batchSize samples will be fed to the model. | Depends on memory size |
+| -network | Name of the network, one of [fcn5 / alexnet / resnet / lstm] | fcn5 |
+| -devId | Training device: -1 for CPU; 0 for GPU 0; 0,2 for GPU 0 and GPU 2. | None, but a training device must be specified |
+| -numEpochs | Number of epochs | 10 |
+| -epochSize | Number of training samples per epoch (not all tools need this parameter) | 50000 |
+| -hostFile | Path to the host file if you are running on multiple machines | None* |
+| -cpuCount | If devId=-1, the number of CPUs to use for training | 1 |
+| -gpuCount | If devId != -1, the number of GPUs to use | None |
+| -lr | Learning rate | None |
+| -netType | Network type, one of [fc / cnn / rnn] | None |
+| -debug | Boolean value; true to debug this script | False |
+### Output
+- \<tool\>bm.py should print out the running result, which will be taken by benchmark.py and posted to the server. The format of the result is:
```
-t $totalTimeInSecond -a $averageMiniBatchTimeInSecond -I $lossValueOfEachEpoch
```
-Example of $lossValueOfEachEpoch (There are 4 epochs' item, and splitted by comma, and the 3 values in each item represent current epoch number, test accuracy and cross entropy value respectively.):
-```
+Example of *$lossValueOfEachEpoch* (there are 4 epoch items separated by `,`; the 3 values in each item, separated by `:`, represent the epoch number, test accuracy and cross entropy respectively):
+```
0:-:2.32620807,1:-:2.49505453,2:-:2.30122152,3:-:2.30028142
-```
-- 4. All tests in common/testbm.sh should be passed before put it into use
+```
+### Benchmark procedure
+#### 1. Build cmd
+- To keep this framework compatible with deep learning tools written in different languages, \<tool\>bm.py is only an interface that standardizes the input and output. Use the arguments above to build the variable cmd; it is executed in a subshell by calling `os.system(cmd)`, and during this run a log file containing the information needed for post-processing must be generated. Some tools generate a log file automatically; if not, redirect all stdout and stderr to the log file. The name of the log file must end with ".log". Here are some examples of cmd:
+Caffe: `caffe train -solver=fcn5-b1024-GPU-solver1.prototxt -gpu=0 >& /root/dlbench/tools/caffe/fc/debug.log`
+Mxnet: `cd fc; python train_mnist.py --lr 0.05 --batch-size 4096 --num-epochs 40 --num-examples 60000 --gpus 1 --kv-store device >& mxnet_debug.log`
+Torch: `THC_CACHING_ALLOCATOR=1 CUDA_VISIBLE_DEVICES=1 th Main.lua -LR 0.05 -dataset MNIST -network ffn5 -batchSize 342 -epoch 40 -logfile torch_debug.log -save torch_debug.log`
+#### 2. Execute cmd
+- Normally you don't need to change anything in this part: cmd is executed and the total running time is measured.
+#### 3. Post processing
+- Three pieces of information need to be printed out:
+**Total running time** `t`: xxxbm.py already handles this.
+**Average batch time** `avgBatch`: the average training time for one batch of data. Note that with more than one GPU the batch size is n\*args.batchSize, since the input argument args.batchSize is per GPU. If the new tool doesn't measure the batch time directly, convert its metrics into seconds/batch here.
+**Batch info** `info`: grep the test accuracy\* and cross entropy of each epoch from the log file and format them as `<epoch>:<accuracy>:<crossEntropy>,<epoch>:<accuracy>:<crossEntropy>,...`
+Then just print out the result: `print "-t " + str(t) + " -a " + str(avgBatch) + " -I " + info`
+and save the original log file under \<tool\>/logs/. A complete end-to-end sketch is given in the Example section below.
+\*There's no need to test the model after each epoch for now, but we still leave the space for future use.
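+### Example
+The sketch below strings the three steps together for a hypothetical Caffe-like tool. It is illustrative only: the solver file name, the log name, and the log-parsing regex are assumptions rather than part of any real tool, so adapt them to whatever your tool actually prints.
+```python
+import argparse
+import os
+import re
+import time
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-log', default='debug.log')
+parser.add_argument('-batchSize', type=int, default=1024)
+parser.add_argument('-devId', type=int, default=-1)
+parser.add_argument('-gpuCount', type=int, default=1)
+parser.add_argument('-numEpochs', type=int, default=10)
+parser.add_argument('-epochSize', type=int, default=50000)
+args = parser.parse_args()
+
+# 1. Build cmd; redirect stdout and stderr so there is a log to post-process.
+cmd = 'caffe train -solver=mysolver.prototxt -gpu=%s >& %s' % (args.devId, args.log)
+
+# 2. Execute cmd and measure the total running time.
+start = time.time()
+os.system(cmd)
+t = time.time() - start
+
+# 3. Post processing: grep the per-epoch values out of the log (the regex is a
+# stand-in) and build "<epoch>:<accuracy>:<crossEntropy>,..."; "-" means no accuracy.
+items = []
+with open(args.log) as logFile:
+    for epoch, m in enumerate(re.finditer(r'cross entropy = ([\d.]+)', logFile.read())):
+        items.append('%d:-:%s' % (epoch, m.group(1)))
+info = ','.join(items)
+
+# args.batchSize is per GPU, so scale it up when training on several GPUs.
+batchSize = args.batchSize if args.devId == -1 else args.batchSize * args.gpuCount
+avgEpoch = t / args.numEpochs                     # seconds per epoch
+avgBatch = avgEpoch / args.epochSize * batchSize  # seconds per batch
+print "-t " + str(t) + " -a " + str(avgBatch) + " -I " + info
+```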
diff --git a/tools/common/xxxbm.py b/tools/common/xxxbm.py
index fd5bb4d..9e5a7c2 100644
--- a/tools/common/xxxbm.py
+++ b/tools/common/xxxbm.py
@@ -25,6 +25,14 @@
# Build cmd for benchmark
cmd = ''
+batchSize = 0
+if args.devId == -1:  # run on CPU
+    batchSize = args.batchSize
+    pass
+else:
+    batchSize = args.batchSize * args.gpuCount
+    pass
+

# Execute cmd

@@ -35,16 +43,16 @@
if args.debug is True:
    print("Time diff: " + str(t))
-#Save log file
+# Post processing
logPath = ''
-with open(logPath, "a") as logFile:
-    logFile.write("Total time: " + str(t) + "\n")
-    logFile.write("cmd: " + cmd + "\n")
-os.system("cp " + logPath + " ../../logs")
-
-# Parse log file and extract benchmark info
avgBatch = (avgEpoch/int(numSamples))*float(batchSize)
if args.debug is True:
    print("Avg Batch: " + str(avgBatch))
info = ''
print "-t " + str(t) + " -a " + str(avgBatch) + " -I " + info
+
+# Save log file
+with open(logPath, "a") as logFile:
+    logFile.write("Total time: " + str(t) + "\n")
+    logFile.write("cmd: " + cmd + "\n")
+os.system("cp " + logPath + " ../../logs")
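
As a sanity check of the avgBatch conversion used in xxxbm.py above, here is the same formula with made-up numbers (avgEpoch, numSamples and batchSize stand in for values parsed from a real log):
```python
# Illustrative numbers only, not from any benchmark run.
avgEpoch = 120.0    # measured seconds per epoch
numSamples = 60000  # training samples per epoch (-epochSize)
batchSize = 1024    # total batch size across all GPUs

# (seconds/epoch) / (samples/epoch) * (samples/batch) = seconds/batch
avgBatch = (avgEpoch / int(numSamples)) * float(batchSize)
print avgBatch      # 2.048
```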