From 96a1793c1fbba9c2ff28d8d2b5de1cc9bd875b2e Mon Sep 17 00:00:00 2001
From: shyhuai <shaohuaishi@gmail.com>
Date: Wed, 19 Jul 2017 14:00:20 +0800
Subject: [PATCH] add collect power script

---
 benchmark.py                                  | 11 +++-
 collect_gpu_power.py                          | 62 +++++++++++++++++++
 configs/v8withinspur/gtx980.config            |  2 +-
 configs/v8withinspur/titanx_pascal.config     |  4 +-
 post_record.py                                |  3 +-
 testing.py                                    |  5 ++
 .../cnn/alexnet/alexnet_imagenet.prototxt     | 20 +++---
 tools/tensorflow/fc/fcn5_mnist.py             |  1 +
 8 files changed, 92 insertions(+), 16 deletions(-)
 create mode 100644 collect_gpu_power.py
 create mode 100644 testing.py
diff --git a/benchmark.py b/benchmark.py
index 2a8148b..f583a1b 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -1,6 +1,8 @@
 import argparse
 import sys,os,time
 import subprocess
+import collect_gpu_power as cgp
+from threading import Thread
 
 
 # Parse arguments
@@ -62,7 +64,7 @@
 	print("Please add -config <path to your config file>")
 	sys.exit(0)
 
-post_flags = " -f " + flag + " -P " + cpu_name + " -A unknown" + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn
+post_flags = " -f " + flag + " -P " + cpu_name + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn
 
 if args.debug:
 	print "[DEBUG] Defalut post flags:" + str(post_flags)
@@ -98,6 +100,7 @@
 		print "\n-------Benchmarking " + tool + " " + exp_args[1] + "-------"
 		log_file += time.ctime()+ "-" + host_name + ".log"
 		log_file = log_file.replace(" ","_")
+		power_log_file = '%s/logs/power_%s' % (root_path, log_file)
 		bm_script = "python " + tool + "bm.py" 
 		bm_script += " -netType " + exp_args[0] + " -log "+log_file+" -batchSize "+exp_args[4]+" -network "+exp_args[1]+" -lr "+exp_args[7]
 		if "-1" in exp_args[2]:
@@ -110,11 +113,15 @@
 			bm_script += " -hostFile " + host_file
 		print bm_script
 		try:
+			thread = Thread(target = cgp.start_collecting_gpu_power, args = (bm_script, power_log_file))
+			thread.start()
 			result_args = subprocess.check_output(bm_script, shell=True).strip().split('\n')[0]
 		except Exception as e:
 			print "Benchmark failed with " + bm_script 
 			os.system("cat " + root_path + "/logs/" + log_file)
 			continue
+		power, mem = cgp.get_average_gpu_power_and_mem(gpu_name, power_log_file)
+		post_flags += " -A " + str(mem) + " -p " + str(power)  # attach to THIS record; a repeated argparse flag keeps its last value
 		post_flags += " " +  result_args + " -b " + exp_args[4] + " -g " + exp_args[3] + " -e " + exp_args[6] + " -E " + exp_args[5] 
 		post_flags += " -l " + log_file + " -T " + tool + " -n " + exp_args[1] 
 		os.chdir(root_path)
@@ -122,7 +129,7 @@
 			post_script = "python post_record.py " + post_flags
 			print post_script
 			print(subprocess.check_output(post_script, shell=True).strip().split('\n')[0])
-			post_flags = " -f " + flag + " -d " + device_name + " -P " + cpu_name + " -A unknown" + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn
+			post_flags = " -f " + flag + " -d " + device_name + " -P " + cpu_name + " -A " + str(mem) + " -r " + cuda_driver + " -C " + cuda + " -D " + cudnn + " -p " + str(power)
 			post_script = ''
 		else:
 			print "Result:"
diff --git a/collect_gpu_power.py b/collect_gpu_power.py
new file mode 100644
index 0000000..515d4a8
--- /dev/null
+++ b/collect_gpu_power.py
@@ -0,0 +1,62 @@
+import argparse
+import os
+import subprocess
+import time
+import numpy as np
+
+
+def benchmark_is_runing(running_cmd):
+    """Return True if `ps aux` lists a process whose line matches running_cmd."""
+    cmd = 'ps aux| grep \"%s\" | grep -v grep' % running_cmd
+    try:
+        result = subprocess.check_output(cmd, shell=True)
+    except (subprocess.CalledProcessError, OSError):
+        # The pipeline exits non-zero when grep matches nothing: not running.
+        # (A bare except here would also swallow KeyboardInterrupt.)
+        return False
+    # Defensive: a zero exit status with empty output still means no match.
+    return len(result) > 0
+
+
+def start_collecting_gpu_power(running_cmd, log_file):
+    """Write one `nvidia-smi` snapshot per second to log_file while running_cmd runs."""
+    with open(log_file, "w") as log:  # closed even if nvidia-smi fails mid-run
+        time.sleep(1)  # presumably lets the benchmark show up in `ps` first
+        while benchmark_is_runing(running_cmd):
+            log.write(subprocess.check_output('nvidia-smi', shell=True))
+            # Flush so a reader that opens the log right away sees every sample.
+            log.flush()
+            time.sleep(1)
+
+
+def get_average_gpu_power_and_mem(gpu_name, log_file):
+    """Average the power (W) and memory (MiB) readings in an nvidia-smi log.
+
+    A reading is the line after any line containing the last three characters
+    of gpu_name; it is skipped unless it has both a '<n>W' and a '<n>MiB' field.
+    The first two readings and the last one are excluded (< 4 readings give nan).
+    """
+    with open(log_file, 'r') as log:  # closed even if reading raises
+        content = log.readlines()
+    tag = gpu_name[len(gpu_name)-3:]
+    powers = []
+    mems = []
+    for header, stats in zip(content, content[1:]):
+        if header.find(tag) <= 0:
+            continue
+        items = stats.split()
+        power = next((i.split('W')[0] for i in items if i.find('W') > 0), None)
+        memory = next((i.split('MiB')[0] for i in items if i.find('MiB') > 0), None)
+        # A missing field used to reuse the previous reading or raise NameError.
+        if power is not None and memory is not None:
+            powers.append(float(power))
+            mems.append(float(memory))
+
+    return np.mean(powers[2:len(powers)-1]), np.mean(mems[2:len(mems)-1])
+
+
+if __name__ == '__main__':
+    # Manual check: parse an existing power log and print the two averages.
+    # To record a log first: start_collecting_gpu_power('python testing.py', 'debug.log')
+    power, mem = get_average_gpu_power_and_mem('GTX980', 'logs/power_mxnet-fc-fcn5-TitanX_Pascal-devId1-c1-b4096-Thu_Jul__6_10:39:09_2017-hpclgpu.log')
+    print power, mem
diff --git a/configs/v8withinspur/gtx980.config b/configs/v8withinspur/gtx980.config
index 47353d0..f9cc533 100644
--- a/configs/v8withinspur/gtx980.config
+++ b/configs/v8withinspur/gtx980.config
@@ -1,6 +1,6 @@
 flag:		sgbenchmark6v8inspur		#Flag of current experiment
 tools:		caffe,cntk,mxnet,torch,tensorflow			#Tools to benchmark
-#tools:		mxnet			#Tools to benchmark
+#tools:		tensorflow			#Tools to benchmark
 experiments: #<network type>; <network name>;  <device id>; <gpu count>;  <batch size>;  <number of epochs>;  <epoch size>; <Learning rate>
 {
 		fc;		    fcn5;		2;		1;		4096;		40;		60000;		0.05
diff --git a/configs/v8withinspur/titanx_pascal.config b/configs/v8withinspur/titanx_pascal.config
index 27cf2d6..bf1f74c 100644
--- a/configs/v8withinspur/titanx_pascal.config
+++ b/configs/v8withinspur/titanx_pascal.config
@@ -1,6 +1,6 @@
 flag:		sgbenchmark6v8inspur		#Flag of current experiment
-tools:		caffe,cntk,mxnet,torch,tensorflow			#Tools to benchmark
-#tools:		mxnet			#Tools to benchmark
+#tools:		caffe,cntk,mxnet,torch,tensorflow			#Tools to benchmark
+tools:		tensorflow			#Tools to benchmark
 experiments: #<network type>; <network name>;  <device id>; <gpu count>;  <batch size>;  <number of epochs>;  <epoch size>; <Learning rate>
 {
 		fc;		    fcn5;		1;		1;		4096;		40;		60000;		0.05
diff --git a/post_record.py b/post_record.py
index 5cdebe1..1a1bf7c 100755
--- a/post_record.py
+++ b/post_record.py
@@ -41,12 +41,13 @@ def post_record(**args):
     parser.add_argument('-C', '--cuda', help='The version of CUDA', default='8.0')
     parser.add_argument('-D', '--cudnn', help='The version of cuDNN', default='5.1')
     parser.add_argument('-r', '--cuda_driver', help='The version of cuda driver', default='367.48')
+    parser.add_argument('-p', '--gpu_power', help='gpu power', default='0')
     parser.add_argument('-v', '--experiment_version', help='The version of running', default='v8')
     p = parser.parse_args()
     object_id = post_record(flag=p.flag, network=p.network, batch_size=p.batch_size, device_name=p.device_name,
                 gpu_count=p.gpu_count, cpu_count=p.cpu_count, cpu_name=p.cpu_name, epoch_size=p.epoch_size, epoch=p.epoch,
                 total_time=p.total_time, average_time=p.average_time, tool_name=p.tool_name, avg_mem=p.average_mem, 
-                epoch_info=p.epoch_info, log_file=p.log_file, cuda=p.cuda, cudnn=p.cudnn, cuda_driver=p.cuda_driver, version=p.experiment_version)
+                epoch_info=p.epoch_info, log_file=p.log_file, cuda=p.cuda, cudnn=p.cudnn, cuda_driver=p.cuda_driver, gpu_power=p.gpu_power, version=p.experiment_version)
     #object_id = post_record(flag='test', network='network')
     print 'post finished, object_id: ', object_id
 
diff --git a/testing.py b/testing.py
new file mode 100644
index 0000000..2ad0199
--- /dev/null
+++ b/testing.py
@@ -0,0 +1,5 @@
+# Endless dummy process (prints every 2 s); collect_gpu_power.py names it as a sample running_cmd.
+import time
+while(True):
+    print 'hello'
+    time.sleep(2)
diff --git a/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt b/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt
index 43501d3..bbce89a 100644
--- a/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt
+++ b/tools/caffe/cnn/alexnet/alexnet_imagenet.prototxt
@@ -7,11 +7,11 @@ layer {
   include {
     phase: TRAIN
   }
-  #transform_param {
-  #  mirror: true
-  #  crop_size: 224
-  #  mean_file: "/home/dataset/caffe/imagenet_mean.binaryproto"
-  #}
+  transform_param {
+    mirror: true
+    crop_size: 224
+    mean_file: "/home/dataset/caffe/imagenet_mean.binaryproto"
+  }
 # mean pixel / channel-wise mean instead of mean image
   transform_param {
     crop_size: 224
@@ -34,11 +34,11 @@ layer {
   include {
     phase: TEST
   }
-  #transform_param {
-  #  mirror: false
-  #  crop_size: 224
-  #  mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
-  #}
+  transform_param {
+    mirror: false
+    crop_size: 224
+    mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
+  }
 # mean pixel / channel-wise mean instead of mean image
   transform_param {
     crop_size: 224
diff --git a/tools/tensorflow/fc/fcn5_mnist.py b/tools/tensorflow/fc/fcn5_mnist.py
index fe0b60d..2588c39 100644
--- a/tools/tensorflow/fc/fcn5_mnist.py
+++ b/tools/tensorflow/fc/fcn5_mnist.py
@@ -58,6 +58,7 @@ def get_real_batch_data(batch_size, label_dim):
 
 def train(model='fcn5'):
     config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
+    #config.gpu_options.allow_growth=True
     device_id = FLAGS.device_id
     device_str = ''
     if int(device_id) >= 0: