From 96a1793c1fbba9c2ff28d8d2b5de1cc9bd875b2e Mon Sep 17 00:00:00 2001 From: shyhuai Date: Wed, 19 Jul 2017 14:00:20 +0800 Subject: [PATCH] add collect power script --- benchmark.py | 11 +++- collect_gpu_power.py | 62 +++++++++++++++++++ configs/v8withinspur/gtx980.config | 2 +- configs/v8withinspur/titanx_pascal.config | 4 +- post_record.py | 3 +- testing.py | 5 ++ .../cnn/alexnet/alexnet_imagenet.prototxt | 20 +++--- tools/tensorflow/fc/fcn5_mnist.py | 1 + 8 files changed, 92 insertions(+), 16 deletions(-) create mode 100644 collect_gpu_power.py create mode 100644 testing.py diff --git a/benchmark.py b/benchmark.py index 2a8148b..f583a1b 100644 --- a/benchmark.py +++ b/benchmark.py @@ -1,6 +1,8 @@ import argparse import sys,os,time import subprocess +import collect_gpu_power as cgp +from threading import Thread # Parse arguments @@ -62,7 +64,7 @@ print("Please add -config ") sys.exit(0) -post_flags = " -f " + flag + " -P " + cpu_name + " -A unknown" + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn +post_flags = " -f " + flag + " -P " + cpu_name + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn if args.debug: print "[DEBUG] Defalut post flags:" + str(post_flags) @@ -98,6 +100,7 @@ print "\n-------Benchmarking " + tool + " " + exp_args[1] + "-------" log_file += time.ctime()+ "-" + host_name + ".log" log_file = log_file.replace(" ","_") + power_log_file = '%s/logs/power_%s' % (root_path, log_file) bm_script = "python " + tool + "bm.py" bm_script += " -netType " + exp_args[0] + " -log "+log_file+" -batchSize "+exp_args[4]+" -network "+exp_args[1]+" -lr "+exp_args[7] if "-1" in exp_args[2]: @@ -110,11 +113,15 @@ bm_script += " -hostFile " + host_file print bm_script try: + thread = Thread(target = cgp.start_collecting_gpu_power, args = (bm_script, power_log_file)) + thread.start() result_args = subprocess.check_output(bm_script, shell=True).strip().split('\n')[0] except Exception as e: print "Benchmark failed with " + bm_script os.system("cat " + root_path + "/logs/" + log_file) continue + + power, mem = cgp.get_average_gpu_power_and_mem(gpu_name, power_log_file) post_flags += " " + result_args + " -b " + exp_args[4] + " -g " + exp_args[3] + " -e " + exp_args[6] + " -E " + exp_args[5] post_flags += " -l " + log_file + " -T " + tool + " -n " + exp_args[1] os.chdir(root_path) @@ -122,7 +129,7 @@ post_script = "python post_record.py " + post_flags print post_script print(subprocess.check_output(post_script, shell=True).strip().split('\n')[0]) - post_flags = " -f " + flag + " -d " + device_name + " -P " + cpu_name + " -A unknown" + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn + post_flags = " -f " + flag + " -d " + device_name + " -P " + cpu_name + " -A " + str(mem) + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn + " -p " + str(power) post_script = '' else: print "Result:" diff --git a/collect_gpu_power.py b/collect_gpu_power.py new file mode 100644 index 0000000..515d4a8 --- /dev/null +++ b/collect_gpu_power.py @@ -0,0 +1,62 @@ +import argparse +import os +import subprocess +import time +import numpy as np + + +def benchmark_is_runing(running_cmd): + cmd = 'ps aux| grep \"%s\" | grep -v grep' % running_cmd + try: + result = subprocess.check_output(cmd, shell=True) + if len(result) > 0: + return True + else: + return False + except: + return False + return False + + +def start_collecting_gpu_power(running_cmd, log_file): + cmd = 'nvidia-smi' + log = open(log_file, "w") + time.sleep(1) + while benchmark_is_runing(running_cmd): + result = subprocess.check_output(cmd, shell=True) + log.write(result) + time.sleep(1) + log.close() + + +def get_average_gpu_power_and_mem(gpu_name, log_file): + log = open(log_file, 'r') + content = log.readlines() + powers = [] + mems = [] + for index, line in enumerate(content): + if line.find(gpu_name[len(gpu_name)-3:]) > 0: + if index == len(content) - 1: + break + valid_line = content[index+1].lstrip() + items = valid_line.split(' ') + for item in items: + if item.find('W') > 0: + power = float(item.split('W')[0]) + break + for item in items: + if item.find('MiB') > 0: + memory = float(item.split('MiB')[0]) + break + + powers.append(power) + mems.append(memory) + log.close() + return np.mean(powers[2:len(powers)-1]), np.mean(mems[2:len(mems)-1]) + + +if __name__ == '__main__': + #running_cmd = 'python testing.py' + #start_collecting_gpu_power(running_cmd, 'debug.log') + power, mem = get_average_gpu_power_and_mem('GTX980', 'logs/power_mxnet-fc-fcn5-TitanX_Pascal-devId1-c1-b4096-Thu_Jul__6_10:39:09_2017-hpclgpu.log') + print power, mem diff --git a/configs/v8withinspur/gtx980.config b/configs/v8withinspur/gtx980.config index 47353d0..f9cc533 100644 --- a/configs/v8withinspur/gtx980.config +++ b/configs/v8withinspur/gtx980.config @@ -1,6 +1,6 @@ flag: sgbenchmark6v8inspur #Flag of current experiment tools: caffe,cntk,mxnet,torch,tensorflow #Tools to benchmark -#tools: mxnet #Tools to benchmark +#tools: tensorflow #Tools to benchmark experiments: #; ; ; ; ; ; ; { fc; fcn5; 2; 1; 4096; 40; 60000; 0.05 diff --git a/configs/v8withinspur/titanx_pascal.config b/configs/v8withinspur/titanx_pascal.config index 27cf2d6..bf1f74c 100644 --- a/configs/v8withinspur/titanx_pascal.config +++ b/configs/v8withinspur/titanx_pascal.config @@ -1,6 +1,6 @@ flag: sgbenchmark6v8inspur #Flag of current experiment -tools: caffe,cntk,mxnet,torch,tensorflow #Tools to benchmark -#tools: mxnet #Tools to benchmark +#tools: caffe,cntk,mxnet,torch,tensorflow #Tools to benchmark +tools: tensorflow #Tools to benchmark experiments: #; ; ; ; ; ; ; { fc; fcn5; 1; 1; 4096; 40; 60000; 0.05 diff --git a/post_record.py b/post_record.py index 5cdebe1..1a1bf7c 100755 --- a/post_record.py +++ b/post_record.py @@ -41,12 +41,13 @@ def post_record(**args): parser.add_argument('-C', '--cuda', help='The version of CUDA', default='8.0') parser.add_argument('-D', '--cudnn', help='The version of cuDNN', default='5.1') parser.add_argument('-r', '--cuda_driver', help='The version of cuda driver', default='367.48') + parser.add_argument('-p', '--gpu_power', help='gpu power', default='0') parser.add_argument('-v', '--experiment_version', help='The version of running', default='v8') p = parser.parse_args() object_id = post_record(flag=p.flag, network=p.network, batch_size=p.batch_size, device_name=p.device_name, gpu_count=p.gpu_count, cpu_count=p.cpu_count, cpu_name=p.cpu_name, epoch_size=p.epoch_size, epoch=p.epoch, total_time=p.total_time, average_time=p.average_time, tool_name=p.tool_name, avg_mem=p.average_mem, - epoch_info=p.epoch_info, log_file=p.log_file, cuda=p.cuda, cudnn=p.cudnn, cuda_driver=p.cuda_driver, version=p.experiment_version) + epoch_info=p.epoch_info, log_file=p.log_file, cuda=p.cuda, cudnn=p.cudnn, cuda_driver=p.cuda_driver, gpu_power=p.gpu_power, version=p.experiment_version) #object_id = post_record(flag='test', network='network') print 'post finished, object_id: ', object_id diff --git a/testing.py b/testing.py new file mode 100644 index 0000000..2ad0199 --- /dev/null +++ b/testing.py @@ -0,0 +1,5 @@ +import time +while(True): + print 'hello' + time.sleep(2) + diff --git a/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt b/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt index 43501d3..bbce89a 100644 --- a/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt +++ b/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt @@ -7,11 +7,11 @@ layer { include { phase: TRAIN } - #transform_param { - # mirror: true - # crop_size: 224 - # mean_file: "/home/dataset/caffe/imagenet_mean.binaryproto" - #} + transform_param { + mirror: true + crop_size: 224 + mean_file: "/home/dataset/caffe/imagenet_mean.binaryproto" + } # mean pixel / channel-wise mean instead of mean image transform_param { crop_size: 224 @@ -34,11 +34,11 @@ layer { include { phase: TEST } - #transform_param { - # mirror: false - # crop_size: 224 - # mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" - #} + transform_param { + mirror: false + crop_size: 224 + mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + } # mean pixel / channel-wise mean instead of mean image transform_param { crop_size: 224 diff --git a/tools/tensorflow/fc/fcn5_mnist.py b/tools/tensorflow/fc/fcn5_mnist.py index fe0b60d..2588c39 100644 --- a/tools/tensorflow/fc/fcn5_mnist.py +++ b/tools/tensorflow/fc/fcn5_mnist.py @@ -58,6 +58,7 @@ def get_real_batch_data(batch_size, label_dim): def train(model='fcn5'): config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) + #config.gpu_options.allow_growth=True device_id = FLAGS.device_id device_str = '' if int(device_id) >= 0: