From 9e60e42a860f5baba6407bd68c197ce334ea68bb Mon Sep 17 00:00:00 2001
From: shyhuai
Date: Tue, 14 Mar 2017 01:10:25 +0800
Subject: [PATCH] revise the setting of number of thread by using MKL

---
 batch-bencmarks-cpu-gpu20.sh | 8 +
 batch-bencmarks-gpu-gpu20.sh | 3 +
 configs/bm2cpu1.config | 10 +-
 configs/bm2cpu16.config | 14 +-
 configs/bm2cpu2.config | 10 +-
 configs/bm2cpu32.config | 10 +-
 configs/bm2cpu4.config | 10 +-
 configs/bm2cpu8.config | 10 +-
 configs/gpuk80.config | 32 +
 post_record.py | 2 +-
 .../experiments/cntk/cnn/alexnet/alexnet.cntk | 4 +-
 .../experiments/cntk/cnn/resnet/resnet.cntk | 4 +-
 synthetic/experiments/cntk/fc/ffn26752.cntk | 4 +-
 .../tensorflow/cnn/alexnet/alexnetbm.py | 10 +-
 .../tensorflow/cnn/alexnet/report.txt | 57 ++
 .../tensorflow/cnn/resnet/report.txt | 106 +++
 .../tensorflow/cnn/resnet/resnet.py | 26 +-
 .../tensorflow/cnn/resnet/resnet_train.py | 20 +-
 .../experiments/tensorflow/fc/ffn26752bm1.py | 86 +++
 .../experiments/tensorflow/fc/report.txt | 6 +
 .../experiments/tensorflow/fc/tf_upgrade.py | 681 ++++++++++++++++++
 .../scripts/batch-bencmarks-gpu-gpu15.sh | 22 +-
 .../scripts/batch-bencmarks-gpu-gpu20.sh | 30 +
 synthetic/scripts/cnn-benchmarks.sh | 5 +-
 synthetic/scripts/fc-benchmarks.sh | 6 +-
 tools/caffe/caffebm.py | 1 +
 tools/cntk/cnn/alexnet/alexnet_cifar10.cntk | 2 +-
 tools/cntk/cnn/resnet/resnet.cntk | 2 +-
 tools/cntk/cntkbm.py | 1 +
 tools/cntk/fc/fcn5.cntk | 2 +-
 tools/cntk/multinodes/fc/Macros.ndl | 35 -
 tools/cntk/multinodes/fc/fc.sh | 6 -
 tools/cntk/multinodes/fc/fcn5.cntk | 78 --
 tools/cntk/multinodes/fc/fcn8.cntk | 89 ---
 tools/cntk/multinodes/fc/ffn.cntk | 87 ---
 tools/cntk/rnn/lstm/lstm.cntk | 2 +-
 tools/mxnet/mxnetbm.py | 1 +
 .../tensorflow/cnn/alexnet/alexnet_cifar10.py | 12 +-
 .../cnn/alexnet/alexnet_cifar10_multi_gpu1.py | 328 +++++++++
 tools/tensorflow/cnn/alexnet/report.txt | 78 ++
 tools/tensorflow/fc/fcn5_mnist.py | 4 +-
 tools/tensorflow/fc/fcn5_mnist_multi_gpu1.py | 228 ++++++
 tools/tensorflow/fc/models.py | 2 +-
 tools/tensorflow/fc/report.txt | 37 +
 tools/tensorflow/tensorflowbm.py | 1 +
 tools/torch/torchbm.py | 7 +-
 46 files changed, 1783 insertions(+), 396 deletions(-)
 create mode 100755 batch-bencmarks-cpu-gpu20.sh
 create mode 100755 batch-bencmarks-gpu-gpu20.sh
 create mode 100644 configs/gpuk80.config
 create mode 100644 synthetic/experiments/tensorflow/cnn/alexnet/report.txt
 create mode 100644 synthetic/experiments/tensorflow/cnn/resnet/report.txt
 create mode 100644 synthetic/experiments/tensorflow/fc/ffn26752bm1.py
 create mode 100644 synthetic/experiments/tensorflow/fc/report.txt
 create mode 100644 synthetic/experiments/tensorflow/fc/tf_upgrade.py
 create mode 100755 synthetic/scripts/batch-bencmarks-gpu-gpu20.sh
 delete mode 100644 tools/cntk/multinodes/fc/Macros.ndl
 delete mode 100644 tools/cntk/multinodes/fc/fc.sh
 delete mode 100644 tools/cntk/multinodes/fc/fcn5.cntk
 delete mode 100644 tools/cntk/multinodes/fc/fcn8.cntk
 delete mode 100644 tools/cntk/multinodes/fc/ffn.cntk
 create mode 100644 tools/tensorflow/cnn/alexnet/alexnet_cifar10_multi_gpu1.py
 create mode 100644 tools/tensorflow/cnn/alexnet/report.txt
 create mode 100644 tools/tensorflow/fc/fcn5_mnist_multi_gpu1.py
 create mode 100644 tools/tensorflow/fc/report.txt

diff --git a/batch-bencmarks-cpu-gpu20.sh b/batch-bencmarks-cpu-gpu20.sh
new file mode 100755
index 0000000..b98fe4a
--- /dev/null
+++ b/batch-bencmarks-cpu-gpu20.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# The benchmarks of all toolkits
+python benchmark.py -config ./configs/bm2cpu1.config -post True
+python benchmark.py
-config ./configs/bm2cpu2.config -post True +python benchmark.py -config ./configs/bm2cpu4.config -post True +python benchmark.py -config ./configs/bm2cpu8.config -post True +python benchmark.py -config ./configs/bm2cpu16.config -post True +python benchmark.py -config ./configs/bm2cpu32.config -post True diff --git a/batch-bencmarks-gpu-gpu20.sh b/batch-bencmarks-gpu-gpu20.sh new file mode 100755 index 0000000..39d42a5 --- /dev/null +++ b/batch-bencmarks-gpu-gpu20.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# The benchmarks of all toolkits +python benchmark.py -config ./configs/gpuk80.config -post True diff --git a/configs/bm2cpu1.config b/configs/bm2cpu1.config index bb8203b..11d87e8 100644 --- a/configs/bm2cpu1.config +++ b/configs/bm2cpu1.config @@ -1,15 +1,15 @@ flag: sgbenchmark6 #Flag of current experiment -tools: torch #Tools to benchmark +tools: caffe,cntk,mxnet,tensorflow,torch #Tools to benchmark experiments: #; ; ; ; ; ; ; { -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 + fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 cnn; alexnet; -1; 1; 1024; 2; 50000; 0.01 cnn; resnet; -1; 1; 128; 2; 50000; 0.01 -# rnn; lstm; -1; 1; 128; 2; 2048; 0.1 + rnn; lstm; -1; 1; 128; 2; 2048; 0.1 } host_file: None #Path to host file or None -cpu_name: E5-2630v3 #CPU model -device_name: E5-2630v3 #GPU model +cpu_name: E5-2630v4 #CPU model +device_name: E5-2630v4 #GPU model cpu_count: 1 #CPU count for cpu parallel cuda: 8.0 #CUDA version cudnn: 5.1 #CUDNN version diff --git a/configs/bm2cpu16.config b/configs/bm2cpu16.config index 4529bb5..1f4651c 100644 --- a/configs/bm2cpu16.config +++ b/configs/bm2cpu16.config @@ -1,19 +1,15 @@ flag: sgbenchmark6 #Flag of current experiment -tools: torch #Tools to benchmark +tools: caffe,cntk,mxnet,tensorflow,torch #Tools to benchmark experiments: #; ; ; ; ; ; ; { -<<<<<<< HEAD -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 -======= -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 ->>>>>>> b259c6d55c4beb261f3e7634d50cbb1acdbd4031 + fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 cnn; alexnet; -1; 1; 1024; 2; 50000; 0.01 cnn; resnet; -1; 1; 128; 2; 50000; 0.01 -# rnn; lstm; -1; 1; 128; 2; 2048; 0.1 + rnn; lstm; -1; 1; 128; 2; 2048; 0.1 } host_file: None #Path to host file or None -cpu_name: E5-2630v3 #CPU model -device_name: E5-2630v3 #GPU model +cpu_name: E5-2630v4 #CPU model +device_name: E5-2630v4 #GPU model cpu_count: 16 #CPU count for cpu parallel cuda: 8.0 #CUDA version cudnn: 5.1 #CUDNN version diff --git a/configs/bm2cpu2.config b/configs/bm2cpu2.config index 7017392..7032f64 100644 --- a/configs/bm2cpu2.config +++ b/configs/bm2cpu2.config @@ -1,15 +1,15 @@ flag: sgbenchmark6 #Flag of current experiment -tools: torch #Tools to benchmark +tools: caffe,cntk,mxnet,tensorflow,torch #Tools to benchmark experiments: #; ; ; ; ; ; ; { -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 + fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 cnn; alexnet; -1; 1; 1024; 2; 50000; 0.01 cnn; resnet; -1; 1; 128; 2; 50000; 0.01 -# rnn; lstm; -1; 1; 128; 2; 2048; 0.1 + rnn; lstm; -1; 1; 128; 2; 2048; 0.1 } host_file: None #Path to host file or None -cpu_name: E5-2630v3 #CPU model -device_name: E5-2630v3 #GPU model +cpu_name: E5-2630v4 #CPU model +device_name: E5-2630v4 #GPU model cpu_count: 2 #CPU count for cpu parallel cuda: 8.0 #CUDA version cudnn: 5.1 #CUDNN version diff --git a/configs/bm2cpu32.config b/configs/bm2cpu32.config index 5cafeea..9e88bc0 100644 --- a/configs/bm2cpu32.config +++ b/configs/bm2cpu32.config @@ -1,15 +1,15 @@ flag: sgbenchmark6 #Flag of current experiment -tools: torch #Tools to benchmark +tools: 
caffe,cntk,mxnet,tensorflow,torch #Tools to benchmark experiments: #; ; ; ; ; ; ; { -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 + fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 cnn; alexnet; -1; 1; 1024; 2; 50000; 0.01 cnn; resnet; -1; 1; 128; 2; 50000; 0.01 -# rnn; lstm; -1; 1; 128; 2; 2048; 0.1 + rnn; lstm; -1; 1; 128; 2; 2048; 0.1 } host_file: None #Path to host file or None -cpu_name: E5-2630v3 #CPU model -device_name: E5-2630v3 #GPU model +cpu_name: E5-2630v4 #CPU model +device_name: E5-2630v4 #GPU model cpu_count: 32 #CPU count for cpu parallel cuda: 8.0 #CUDA version cudnn: 5.1 #CUDNN version diff --git a/configs/bm2cpu4.config b/configs/bm2cpu4.config index 4b40114..3bd2b9d 100644 --- a/configs/bm2cpu4.config +++ b/configs/bm2cpu4.config @@ -1,15 +1,15 @@ flag: sgbenchmark6 #Flag of current experiment -tools: torch #Tools to benchmark +tools: caffe,cntk,mxnet,tensorflow,torch #Tools to benchmark experiments: #; ; ; ; ; ; ; { -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 + fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 cnn; alexnet; -1; 1; 1024; 2; 50000; 0.01 cnn; resnet; -1; 1; 128; 2; 50000; 0.01 -# rnn; lstm; -1; 1; 128; 2; 2048; 0.1 + rnn; lstm; -1; 1; 128; 2; 2048; 0.1 } host_file: None #Path to host file or None -cpu_name: E5-2630v3 #CPU model -device_name: E5-2630v3 #GPU model +cpu_name: E5-2630v4 #CPU model +device_name: E5-2630v4 #GPU model cpu_count: 4 #CPU count for cpu parallel cuda: 8.0 #CUDA version cudnn: 5.1 #CUDNN version diff --git a/configs/bm2cpu8.config b/configs/bm2cpu8.config index ee32a32..e27a521 100644 --- a/configs/bm2cpu8.config +++ b/configs/bm2cpu8.config @@ -1,15 +1,15 @@ flag: sgbenchmark6 #Flag of current experiment -tools: torch #Tools to benchmark +tools: caffe,cntk,mxnet,tensorflow,torch #Tools to benchmark experiments: #; ; ; ; ; ; ; { -# fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 + fc; fcn5; -1; 1; 1024; 4; 60000; 0.05 cnn; alexnet; -1; 1; 1024; 2; 50000; 0.01 cnn; resnet; -1; 1; 128; 2; 50000; 0.01 -# rnn; lstm; -1; 1; 128; 2; 2048; 0.1 + rnn; lstm; -1; 1; 128; 2; 2048; 0.1 } host_file: None #Path to host file or None -cpu_name: E5-2630v3 #CPU model -device_name: E5-2630v3 #GPU model +cpu_name: E5-2630v4 #CPU model +device_name: E5-2630v4 #GPU model cpu_count: 8 #CPU count for cpu parallel cuda: 8.0 #CUDA version cudnn: 5.1 #CUDNN version diff --git a/configs/gpuk80.config b/configs/gpuk80.config new file mode 100644 index 0000000..2e55512 --- /dev/null +++ b/configs/gpuk80.config @@ -0,0 +1,32 @@ +flag: sgbenchmark6 #Flag of current experiment +tools: torch #Tools to benchmark +experiments: #; ; ; ; ; ; ; +{ + fc; fcn5; 0; 1; 4096; 40; 60000; 0.05 + fc; fcn5; 0; 1; 2048; 40; 60000; 0.05 + fc; fcn5; 0; 1; 1024; 40; 60000; 0.05 + fc; fcn5; 0; 1; 512; 40; 60000; 0.05 + fc; fcn5; 0; 1; 342; 40; 60000; 0.05 + cnn; alexnet; 0; 1; 2048; 40; 50000; 0.01 + cnn; alexnet; 0; 1; 1024; 40; 50000; 0.01 + cnn; alexnet; 0; 1; 512; 40; 50000; 0.01 + cnn; alexnet; 0; 1; 256; 40; 50000; 0.01 + cnn; alexnet; 0; 1; 128; 40; 50000; 0.01 + cnn; alexnet; 0; 1; 86; 40; 50000; 0.01 + cnn; resnet; 0; 1; 128; 40; 50000; 0.01 + cnn; resnet; 0; 1; 64; 40; 50000; 0.01 + cnn; resnet; 0; 1; 32; 40; 50000; 0.01 + cnn; resnet; 0; 1; 16; 40; 50000; 0.01 + cnn; resnet; 0; 1; 11; 40; 50000; 0.01 +# rnn; lstm; 0; 1; 1024; 20; -1; 0.1 +# rnn; lstm; 0; 1; 512; 20; -1; 0.1 +# rnn; lstm; 0; 1; 256; 20; -1; 0.1 +# rnn; lstm; 0; 1; 128; 20; -1; 0.1 +# rnn; lstm; 0; 1; 64; 20; -1; 0.1 +} +host_file: None #Path to host file or None +cpu_name: E5-2630v4 #CPU model +device_name: K80 #GPU model +cuda: 8.0 #CUDA version 
+cudnn: 5.1 #CUDNN version +cuda_driver: 367.48 #CUDA driver version diff --git a/post_record.py b/post_record.py index 7ed5efd..a37f4cb 100755 --- a/post_record.py +++ b/post_record.py @@ -53,7 +53,7 @@ def post_record(**args): object_id = post_record(flag=p.flag, network=p.network, batch_size=p.batch_size, device_name=p.device_name, gpu_count=p.gpu_count, cpu_count=p.cpu_count, cpu_name=p.cpu_name, epoch_size=p.epoch_size, epoch=p.epoch, total_time=p.total_time, average_time=p.average_time, tool_name=p.tool_name, avg_mem=p.average_mem, - epoch_info=p.epoch_info, log_file=p.log_file, cuda=p.cuda, cudnn=p.cudnn, cuda_driver=p.cuda_driver) + epoch_info=p.epoch_info, log_file=p.log_file, cuda=p.cuda, cudnn=p.cudnn, cuda_driver=p.cuda_driver, version=p.experiment_version) #object_id = post_record(flag='test', network='network') print 'post finished, object_id: ', object_id diff --git a/synthetic/experiments/cntk/cnn/alexnet/alexnet.cntk b/synthetic/experiments/cntk/cnn/alexnet/alexnet.cntk index 08b2f67..7da5905 100644 --- a/synthetic/experiments/cntk/cnn/alexnet/alexnet.cntk +++ b/synthetic/experiments/cntk/cnn/alexnet/alexnet.cntk @@ -1,7 +1,7 @@ WorkDir=. ModelDir=$WorkDir$/Output/$ConfigName$ -#DataDir=/home/comp/csshshi/data/cntk -DataDir=/home/ipdps/data/cntk/synthetic +DataDir=/home/comp/csshshi/data/cntk +#DataDir=/home/ipdps/data/cntk/synthetic ndlMacros=$WorkDir$/Macros.ndl diff --git a/synthetic/experiments/cntk/cnn/resnet/resnet.cntk b/synthetic/experiments/cntk/cnn/resnet/resnet.cntk index e7aaec2..00504a2 100644 --- a/synthetic/experiments/cntk/cnn/resnet/resnet.cntk +++ b/synthetic/experiments/cntk/cnn/resnet/resnet.cntk @@ -2,8 +2,8 @@ RootDir = "." ConfigDir = "$RootDir$" #DataDir = "$RootDir$" -#DataDir=/home/comp/csshshi/data/cntk -DataDir=/home/ipdps/data/cntk/synthetic +DataDir=/home/comp/csshshi/data/cntk +#DataDir=/home/ipdps/data/cntk/synthetic OutputDir = "$RootDir$/Output" ModelDir = "$OutputDir$/Models" diff --git a/synthetic/experiments/cntk/fc/ffn26752.cntk b/synthetic/experiments/cntk/fc/ffn26752.cntk index 3d1522f..efc165c 100644 --- a/synthetic/experiments/cntk/fc/ffn26752.cntk +++ b/synthetic/experiments/cntk/fc/ffn26752.cntk @@ -1,8 +1,8 @@ WorkDir=. 
ModelDir=$WorkDir$/Output/$ConfigName$ #stderr=$WorkDir$/logs/$ConfigName$/out -#DataDir=/home/comp/csshshi/data/cntk -DataDir=/home/ipdps/data/cntk/synthetic +DataDir=/home/comp/csshshi/data/cntk +#DataDir=/home/ipdps/data/cntk/synthetic precision=float deviceId=0 diff --git a/synthetic/experiments/tensorflow/cnn/alexnet/alexnetbm.py b/synthetic/experiments/tensorflow/cnn/alexnet/alexnetbm.py index 9c5a6a9..d1a7dec 100644 --- a/synthetic/experiments/tensorflow/cnn/alexnet/alexnetbm.py +++ b/synthetic/experiments/tensorflow/cnn/alexnet/alexnetbm.py @@ -102,11 +102,11 @@ def loss(logits, labels): batch_size = tf.size(labels) labels = tf.expand_dims(labels, 1) indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) - concated = tf.concat(1, [indices, labels]) + concated = tf.concat(axis=1, values=[indices, labels]) onehot_labels = tf.sparse_to_dense( - concated, tf.pack([batch_size, 1000]), 1.0, 0.0) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, - onehot_labels, + concated, tf.stack([batch_size, 1000]), 1.0, 0.0) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, + labels=onehot_labels, name='xentropy') loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') return loss @@ -184,7 +184,7 @@ def run_benchmark(): last_layer = inference(images) # Build an initialization operation. - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() # Start running operations on the Graph. sess = tf.Session(config=config) diff --git a/synthetic/experiments/tensorflow/cnn/alexnet/report.txt b/synthetic/experiments/tensorflow/cnn/alexnet/report.txt new file mode 100644 index 0000000..60eb09c --- /dev/null +++ b/synthetic/experiments/tensorflow/cnn/alexnet/report.txt @@ -0,0 +1,57 @@ +-------------------------------------------------------------------------------- +Processing file 'alexnetbm.py' + outputting to 'alexnetbm1.py' +-------------------------------------------------------------------------------- + +'alexnetbm.py' Line 105 +-------------------------------------------------------------------------------- + +Added keyword 'concat_dim' to reordered function 'tf.concat' +Added keyword 'values' to reordered function 'tf.concat' + + Old: concated = tf.concat(1, [indices, labels]) + + New: concated = tf.concat(axis=1, values=[indices, labels]) + ~~~~~ ~~~~~~~ + +'alexnetbm.py' Line 187 +-------------------------------------------------------------------------------- + +Renamed function 'tf.initialize_all_variables' to 'tf.global_variables_initializer' + + Old: init = tf.initialize_all_variables() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + New: init = tf.global_variables_initializer() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +'alexnetbm.py' Line 107 +-------------------------------------------------------------------------------- + +Renamed function 'tf.pack' to 'tf.stack' + + Old: concated, tf.pack([batch_size, 1000]), 1.0, 0.0) + ~~~~~~~ + New: concated, tf.stack([batch_size, 1000]), 1.0, 0.0) + ~~~~~~~~ + +'alexnetbm.py' Line 108 +-------------------------------------------------------------------------------- + +Added keyword 'logits' to reordered function 'tf.nn.softmax_cross_entropy_with_logits' + + Old: cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, + + New: cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, + ~~~~~~~ + +'alexnetbm.py' Line 109 +-------------------------------------------------------------------------------- + +Added keyword 'labels' to reordered function 
'tf.nn.softmax_cross_entropy_with_logits' + + Old: onehot_labels, + + New: labels=onehot_labels, + ~~~~~~~ + + diff --git a/synthetic/experiments/tensorflow/cnn/resnet/report.txt b/synthetic/experiments/tensorflow/cnn/resnet/report.txt new file mode 100644 index 0000000..6b1a110 --- /dev/null +++ b/synthetic/experiments/tensorflow/cnn/resnet/report.txt @@ -0,0 +1,106 @@ +-------------------------------------------------------------------------------- +Processing file 'resnet_train.py' + outputting to 'resnet_train1.py' +-------------------------------------------------------------------------------- + +'resnet_train.py' Line 69 +-------------------------------------------------------------------------------- + +Renamed function 'tf.all_variables' to 'tf.global_variables' + + Old: saver = tf.train.Saver(tf.all_variables()) + ~~~~~~~~~~~~~~~~ + New: saver = tf.train.Saver(tf.global_variables()) + ~~~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 71 +-------------------------------------------------------------------------------- + +Renamed function 'tf.merge_all_summaries' to 'tf.summary.merge_all' + + Old: summary_op = tf.merge_all_summaries() + ~~~~~~~~~~~~~~~~~~~~~~ + New: summary_op = tf.summary.merge_all() + ~~~~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 41 +-------------------------------------------------------------------------------- + +Renamed function 'tf.scalar_summary' to 'tf.summary.scalar' + + Old: tf.scalar_summary('loss_avg', ema.average(loss_)) + ~~~~~~~~~~~~~~~~~ + New: tf.summary.scalar('loss_avg', ema.average(loss_)) + ~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 47 +-------------------------------------------------------------------------------- + +Renamed function 'tf.scalar_summary' to 'tf.summary.scalar' + + Old: tf.scalar_summary('val_top1_error_avg', top1_error_avg) + ~~~~~~~~~~~~~~~~~ + New: tf.summary.scalar('val_top1_error_avg', top1_error_avg) + ~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 49 +-------------------------------------------------------------------------------- + +Renamed function 'tf.scalar_summary' to 'tf.summary.scalar' + + Old: tf.scalar_summary('learning_rate', FLAGS.learning_rate) + ~~~~~~~~~~~~~~~~~ + New: tf.summary.scalar('learning_rate', FLAGS.learning_rate) + ~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 73 +-------------------------------------------------------------------------------- + +Renamed function 'tf.initialize_all_variables' to 'tf.global_variables_initializer' + + Old: init = tf.initialize_all_variables() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + New: init = tf.global_variables_initializer() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 55 +-------------------------------------------------------------------------------- + +Renamed function 'tf.histogram_summary' to 'tf.summary.histogram' + + Old: tf.histogram_summary(var.op.name + '/gradients', grad) + ~~~~~~~~~~~~~~~~~~~~ + New: tf.summary.histogram(var.op.name + '/gradients', grad) + ~~~~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 79 +-------------------------------------------------------------------------------- + +Renamed function 'tf.train.SummaryWriter' to 'tf.summary.FileWriter' + + Old: summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) + ~~~~~~~~~~~~~~~~~~~~~~ + New: summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) + ~~~~~~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 60 +-------------------------------------------------------------------------------- + +Renamed function 'tf.image_summary' to 'tf.summary.image' + + Old: 
tf.image_summary('images', images) + ~~~~~~~~~~~~~~~~ + New: tf.summary.image('images', images) + ~~~~~~~~~~~~~~~~ + +'resnet_train.py' Line 63 +-------------------------------------------------------------------------------- + +Renamed function 'tf.histogram_summary' to 'tf.summary.histogram' + + Old: tf.histogram_summary(var.op.name, var) + ~~~~~~~~~~~~~~~~~~~~ + New: tf.summary.histogram(var.op.name, var) + ~~~~~~~~~~~~~~~~~~~~ + + diff --git a/synthetic/experiments/tensorflow/cnn/resnet/resnet.py b/synthetic/experiments/tensorflow/cnn/resnet/resnet.py index ba51d7d..63d6fcb 100644 --- a/synthetic/experiments/tensorflow/cnn/resnet/resnet.py +++ b/synthetic/experiments/tensorflow/cnn/resnet/resnet.py @@ -77,7 +77,7 @@ def inference(x, is_training, x = stack(x, c) # post-net - x = tf.reduce_mean(x, reduction_indices=[1, 2], name="avg_pool") + x = tf.reduce_mean(x, axis=[1, 2], name="avg_pool") if num_classes != None: with tf.variable_scope('fc'): @@ -127,7 +127,7 @@ def inference_small_config(x, c): x = stack(x, c) # post-net - x = tf.reduce_mean(x, reduction_indices=[1, 2], name="avg_pool") + x = tf.reduce_mean(x, axis=[1, 2], name="avg_pool") if c['num_classes'] != None: with tf.variable_scope('fc'): @@ -138,20 +138,20 @@ def inference_small_config(x, c): def _imagenet_preprocess(rgb): """Changes RGB [0,1] valued image to BGR [0,255] with mean subtracted.""" - red, green, blue = tf.split(3, 3, rgb * 255.0) - bgr = tf.concat(3, [blue, green, red]) + red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb * 255.0) + bgr = tf.concat(axis=3, values=[blue, green, red]) bgr -= IMAGENET_MEAN_BGR return bgr def loss(logits, labels): - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels) + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels) cross_entropy_mean = tf.reduce_mean(cross_entropy) regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) loss_ = tf.add_n([cross_entropy_mean] + regularization_losses) - tf.scalar_summary('loss', loss_) + tf.summary.scalar('loss', loss_) return loss_ @@ -231,7 +231,7 @@ def bn(x, c): if c['use_bias']: bias = _get_variable('bias', params_shape, - initializer=tf.zeros_initializer) + initializer=tf.zeros_initializer()) return x + bias @@ -239,18 +239,18 @@ def bn(x, c): beta = _get_variable('beta', params_shape, - initializer=tf.zeros_initializer) + initializer=tf.zeros_initializer()) gamma = _get_variable('gamma', params_shape, - initializer=tf.ones_initializer) + initializer=tf.ones_initializer()) moving_mean = _get_variable('moving_mean', params_shape, - initializer=tf.zeros_initializer, + initializer=tf.zeros_initializer(), trainable=False) moving_variance = _get_variable('moving_variance', params_shape, - initializer=tf.ones_initializer, + initializer=tf.ones_initializer(), trainable=False) # These ops will only be preformed when training. 
@@ -284,7 +284,7 @@ def fc(x, c): weight_decay=FC_WEIGHT_STDDEV) biases = _get_variable('biases', shape=[num_units_out], - initializer=tf.zeros_initializer) + initializer=tf.zeros_initializer()) x = tf.nn.xw_plus_b(x, weights, biases) return x @@ -301,7 +301,7 @@ def _get_variable(name, regularizer = tf.contrib.layers.l2_regularizer(weight_decay) else: regularizer = None - collections = [tf.GraphKeys.VARIABLES, RESNET_VARIABLES] + collections = [tf.GraphKeys.GLOBAL_VARIABLES, RESNET_VARIABLES] return tf.get_variable(name, shape=shape, initializer=initializer, diff --git a/synthetic/experiments/tensorflow/cnn/resnet/resnet_train.py b/synthetic/experiments/tensorflow/cnn/resnet/resnet_train.py index 5a98f49..4c5f32c 100644 --- a/synthetic/experiments/tensorflow/cnn/resnet/resnet_train.py +++ b/synthetic/experiments/tensorflow/cnn/resnet/resnet_train.py @@ -38,45 +38,45 @@ def train(is_training, logits, images, labels): # loss_avg ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step) tf.add_to_collection(UPDATE_OPS_COLLECTION, ema.apply([loss_])) - tf.scalar_summary('loss_avg', ema.average(loss_)) + tf.summary.scalar('loss_avg', ema.average(loss_)) # validation stats ema = tf.train.ExponentialMovingAverage(0.9, val_step) val_op = tf.group(val_step.assign_add(1), ema.apply([top1_error])) top1_error_avg = ema.average(top1_error) - tf.scalar_summary('val_top1_error_avg', top1_error_avg) + tf.summary.scalar('val_top1_error_avg', top1_error_avg) - tf.scalar_summary('learning_rate', FLAGS.learning_rate) + tf.summary.scalar('learning_rate', FLAGS.learning_rate) opt = tf.train.MomentumOptimizer(FLAGS.learning_rate, MOMENTUM) grads = opt.compute_gradients(loss_) for grad, var in grads: if grad is not None and not FLAGS.minimal_summaries: - tf.histogram_summary(var.op.name + '/gradients', grad) + tf.summary.histogram(var.op.name + '/gradients', grad) apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) if not FLAGS.minimal_summaries: # Display the training images in the visualizer. - tf.image_summary('images', images) + tf.summary.image('images', images) for var in tf.trainable_variables(): - tf.histogram_summary(var.op.name, var) + tf.summary.histogram(var.op.name, var) batchnorm_updates = tf.get_collection(UPDATE_OPS_COLLECTION) batchnorm_updates_op = tf.group(*batchnorm_updates) train_op = tf.group(apply_gradient_op, batchnorm_updates_op) - saver = tf.train.Saver(tf.all_variables()) + saver = tf.train.Saver(tf.global_variables()) - summary_op = tf.merge_all_summaries() + summary_op = tf.summary.merge_all() - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() sess = tf.Session(config=config) sess.run(init) tf.train.start_queue_runners(sess=sess) - summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) + summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) if FLAGS.resume: latest = tf.train.latest_checkpoint(FLAGS.train_dir) diff --git a/synthetic/experiments/tensorflow/fc/ffn26752bm1.py b/synthetic/experiments/tensorflow/fc/ffn26752bm1.py new file mode 100644 index 0000000..9bfdc22 --- /dev/null +++ b/synthetic/experiments/tensorflow/fc/ffn26752bm1.py @@ -0,0 +1,86 @@ +# A feed-forward DNN with 5 hidden layers using sigmoid activations. 
+import os +import time +import tensorflow as tf +#import ffn +import argparse + +from ffn26752 import * + +device_str = '' + +def set_parameters(epochs, minibatch, iterations, device_id): + """ + iterations means the number of iterations in each epoch + """ + global device_str + if int(device_id) >= 0: + device_str = '/gpu:%d'%int(device_id) + else: + # cpus + device_str = '/cpu:0' + global numMinibatches + numMinibatches = iterations*epochs + #numMinibatches = (138493+minibatch-1)/minibatch * epochs + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--epochs", help="the number of epochs", type=int, default=4) + parser.add_argument("-b", "--minibatch", help="minibatch size", type=int, default=128) + parser.add_argument("-i", "--iterations", help="iterations", type=int, default=2) + parser.add_argument("-d", "--deviceid", help="specified device id", type=int, default=0) + args = parser.parse_args() + + epochs = args.epochs + minibatch = args.minibatch + iterations = args.iterations + device_id = args.deviceid + minibatchSize = args.minibatch + + set_parameters(epochs, minibatch, iterations, device_id) + + program_start_time = time.time() + + # Create the model + if (FLAGS.noInputFeed): + features, labels = getFakeMinibatch(minibatchSize) + else: + features = tf.placeholder("float", [None, featureDim]) + labels = tf.placeholder("float", [None, labelDim]) + config = tf.ConfigProto(allow_soft_placement=True) + if device_str.find('cpu') >= 0: # cpu version + num_threads = os.getenv('OMP_NUM_THREADS', 1) + print 'num_threads: ', num_threads + config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=int(num_threads)) + + + with tf.device(device_str): + crossEntropy, accuracy = getLossAndAccuracyForSubBatch(features, labels) + trainStep = tf.train.GradientDescentOptimizer(0.01).minimize(crossEntropy) + + # Train + #sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.logDevicePlacement, allow_soft_placement=True)) + sess = tf.Session(config=config) + init = tf.global_variables_initializer() + sess.run(init) + + perMinibatchTime = [] + for i in range(numMinibatches): + if (FLAGS.noInputFeed == False): + minibatchFeatures, minibatchLabels = getFakeMinibatch(minibatchSize) + + startTime = time.time() + if (FLAGS.noInputFeed): + sess.run([trainStep, accuracy]) + else: + sess.run([trainStep, accuracy], feed_dict={features: minibatchFeatures, labels: minibatchLabels}) + + currMinibatchDuration = time.time() - startTime + perMinibatchTime.append(currMinibatchDuration) + + printTrainingStats(1, minibatchSize, perMinibatchTime) + + program_end_time = time.time() + #print('Program finished, Total seconds: %s' % (program_end_time - program_start_time)) diff --git a/synthetic/experiments/tensorflow/fc/report.txt b/synthetic/experiments/tensorflow/fc/report.txt new file mode 100644 index 0000000..3c69d4b --- /dev/null +++ b/synthetic/experiments/tensorflow/fc/report.txt @@ -0,0 +1,6 @@ +-------------------------------------------------------------------------------- +Processing file 'ffn26752.py' + outputting to 'ffn267521.py' +-------------------------------------------------------------------------------- + + diff --git a/synthetic/experiments/tensorflow/fc/tf_upgrade.py b/synthetic/experiments/tensorflow/fc/tf_upgrade.py new file mode 100644 index 0000000..bcff10f --- /dev/null +++ b/synthetic/experiments/tensorflow/fc/tf_upgrade.py @@ -0,0 +1,681 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Upgrader for Python scripts from pre-1.0 TensorFlow to 1.0 TensorFlow.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import argparse +import ast +import collections +import os +import shutil +import sys +import tempfile +import traceback + + +class APIChangeSpec(object): + """List of maps that describe what changed in the API.""" + + def __init__(self): + # Maps from a function name to a dictionary that describes how to + # map from an old argument keyword to the new argument keyword. + self.function_keyword_renames = { + "tf.count_nonzero": { + "reduction_indices": "axis" + }, + "tf.reduce_all": { + "reduction_indices": "axis" + }, + "tf.reduce_any": { + "reduction_indices": "axis" + }, + "tf.reduce_max": { + "reduction_indices": "axis" + }, + "tf.reduce_mean": { + "reduction_indices": "axis" + }, + "tf.reduce_min": { + "reduction_indices": "axis" + }, + "tf.reduce_prod": { + "reduction_indices": "axis" + }, + "tf.reduce_sum": { + "reduction_indices": "axis" + }, + "tf.reduce_logsumexp": { + "reduction_indices": "axis" + }, + "tf.expand_dims": { + "dim": "axis" + }, + "tf.argmax": { + "dimension": "axis" + }, + "tf.argmin": { + "dimension": "axis" + }, + "tf.reduce_join": { + "reduction_indices": "axis" + }, + "tf.sparse_concat": { + "concat_dim": "axis" + }, + "tf.sparse_split": { + "split_dim": "axis" + }, + "tf.sparse_reduce_sum": { + "reduction_axes": "axis" + }, + "tf.reverse_sequence": { + "seq_dim": "seq_axis", + "batch_dim": "batch_axis" + }, + "tf.sparse_reduce_sum_sparse": { + "reduction_axes": "axis" + }, + "tf.squeeze": { + "squeeze_dims": "axis" + }, + "tf.split": { + "split_dim": "axis", + "num_split": "num_or_size_splits" + }, + "tf.concat": { + "concat_dim": "axis" + }, + } + + # Mapping from function to the new name of the function + self.function_renames = { + "tf.inv": "tf.reciprocal", + "tf.contrib.deprecated.scalar_summary": "tf.summary.scalar", + "tf.contrib.deprecated.histogram_summary": "tf.summary.histogram", + "tf.listdiff": "tf.setdiff1d", + "tf.list_diff": "tf.setdiff1d", + "tf.mul": "tf.multiply", + "tf.neg": "tf.negative", + "tf.sub": "tf.subtract", + "tf.train.SummaryWriter": "tf.summary.FileWriter", + "tf.scalar_summary": "tf.summary.scalar", + "tf.histogram_summary": "tf.summary.histogram", + "tf.audio_summary": "tf.summary.audio", + "tf.image_summary": "tf.summary.image", + "tf.merge_summary": "tf.summary.merge", + "tf.merge_all_summaries": "tf.summary.merge_all", + "tf.image.per_image_whitening": "tf.image.per_image_standardization", + "tf.all_variables": "tf.global_variables", + "tf.VARIABLES": "tf.GLOBAL_VARIABLES", + "tf.initialize_all_variables": "tf.global_variables_initializer", + "tf.initialize_variables": "tf.variables_initializer", + "tf.initialize_local_variables": "tf.local_variables_initializer", + "tf.batch_matrix_diag": "tf.matrix_diag", + 
"tf.batch_band_part": "tf.band_part", + "tf.batch_set_diag": "tf.set_diag", + "tf.batch_matrix_transpose": "tf.matrix_transpose", + "tf.batch_matrix_determinant": "tf.matrix_determinant", + "tf.batch_matrix_inverse": "tf.matrix_inverse", + "tf.batch_cholesky": "tf.cholesky", + "tf.batch_cholesky_solve": "tf.cholesky_solve", + "tf.batch_matrix_solve": "tf.matrix_solve", + "tf.batch_matrix_triangular_solve": "tf.matrix_triangular_solve", + "tf.batch_matrix_solve_ls": "tf.matrix_solve_ls", + "tf.batch_self_adjoint_eig": "tf.self_adjoint_eig", + "tf.batch_self_adjoint_eigvals": "tf.self_adjoint_eigvals", + "tf.batch_svd": "tf.svd", + "tf.batch_fft": "tf.fft", + "tf.batch_ifft": "tf.ifft", + "tf.batch_ifft2d": "tf.ifft2d", + "tf.batch_fft3d": "tf.fft3d", + "tf.batch_ifft3d": "tf.ifft3d", + "tf.select": "tf.where", + "tf.complex_abs": "tf.abs", + "tf.batch_matmul": "tf.matmul", + "tf.pack": "tf.stack", + "tf.unpack": "tf.unstack", + } + + self.change_to_function = { + "tf.ones_initializer", + "tf.zeros_initializer", + } + + # Functions that were reordered should be changed to the new keyword args + # for safety, if positional arguments are used. If you have reversed the + # positional arguments yourself, this could do the wrong thing. + self.function_reorders = { + "tf.split": ["axis", "num_or_size_splits", "value", "name"], + "tf.sparse_split": ["axis", "num_or_size_splits", "value", "name"], + "tf.concat": ["concat_dim", "values", "name"], + "tf.svd": ["tensor", "compute_uv", "full_matrices", "name"], + "tf.nn.softmax_cross_entropy_with_logits": [ + "logits", "labels", "dim", "name"], + "tf.nn.sparse_softmax_cross_entropy_with_logits": [ + "logits", "labels", "name"], + "tf.nn.sigmoid_cross_entropy_with_logits": [ + "logits", "labels", "name"] + } + + # Specially handled functions. + self.function_handle = {"tf.reverse": self._reverse_handler} + + @staticmethod + def _reverse_handler(file_edit_recorder, node): + # TODO(aselle): Could check for a literal list of bools and try to convert + # them to indices. + comment = ("ERROR: tf.reverse has had its argument semantics changed\n" + "significantly the converter cannot detect this reliably, so you" + "need to inspect this usage manually.\n") + file_edit_recorder.add(comment, + node.lineno, + node.col_offset, + "tf.reverse", + "tf.reverse", + error="tf.reverse requires manual check.") + + +class FileEditTuple(collections.namedtuple( + "FileEditTuple", ["comment", "line", "start", "old", "new"])): + """Each edit that is recorded by a FileEditRecorder. + + Fields: + comment: A description of the edit and why it was made. + line: The line number in the file where the edit occurs (1-indexed). + start: The line number in the file where the edit occurs (0-indexed). + old: text string to remove (this must match what was in file). + new: text string to add in place of `old`. + """ + + __slots__ = () + + +class FileEditRecorder(object): + """Record changes that need to be done to the file.""" + + def __init__(self, filename): + # all edits are lists of chars + self._filename = filename + + self._line_to_edit = collections.defaultdict(list) + self._errors = [] + + def process(self, text): + """Process a list of strings, each corresponding to the recorded changes. + + Args: + text: A list of lines of text (assumed to contain newlines) + Returns: + A tuple of the modified text and a textual description of what is done. + Raises: + ValueError: if substitution source location does not have expected text. 
+ """ + + change_report = "" + + # Iterate of each line + for line, edits in self._line_to_edit.items(): + offset = 0 + # sort by column so that edits are processed in order in order to make + # indexing adjustments cumulative for changes that change the string + # length + edits.sort(key=lambda x: x.start) + + # Extract each line to a list of characters, because mutable lists + # are editable, unlike immutable strings. + char_array = list(text[line - 1]) + + # Record a description of the change + change_report += "%r Line %d\n" % (self._filename, line) + change_report += "-" * 80 + "\n\n" + for e in edits: + change_report += "%s\n" % e.comment + change_report += "\n Old: %s" % (text[line - 1]) + + # Make underscore buffers for underlining where in the line the edit was + change_list = [" "] * len(text[line - 1]) + change_list_new = [" "] * len(text[line - 1]) + + # Iterate for each edit + for e in edits: + # Create effective start, end by accounting for change in length due + # to previous edits + start_eff = e.start + offset + end_eff = start_eff + len(e.old) + + # Make sure the edit is changing what it should be changing + old_actual = "".join(char_array[start_eff:end_eff]) + if old_actual != e.old: + raise ValueError("Expected text %r but got %r" % + ("".join(e.old), "".join(old_actual))) + # Make the edit + char_array[start_eff:end_eff] = list(e.new) + + # Create the underline highlighting of the before and after + change_list[e.start:e.start + len(e.old)] = "~" * len(e.old) + change_list_new[start_eff:end_eff] = "~" * len(e.new) + + # Keep track of how to generate effective ranges + offset += len(e.new) - len(e.old) + + # Finish the report comment + change_report += " %s\n" % "".join(change_list) + text[line - 1] = "".join(char_array) + change_report += " New: %s" % (text[line - 1]) + change_report += " %s\n\n" % "".join(change_list_new) + return "".join(text), change_report, self._errors + + def add(self, comment, line, start, old, new, error=None): + """Add a new change that is needed. + + Args: + comment: A description of what was changed + line: Line number (1 indexed) + start: Column offset (0 indexed) + old: old text + new: new text + error: this "edit" is something that cannot be fixed automatically + Returns: + None + """ + + self._line_to_edit[line].append( + FileEditTuple(comment, line, start, old, new)) + if error: + self._errors.append("%s:%d: %s" % (self._filename, line, error)) + + +class TensorFlowCallVisitor(ast.NodeVisitor): + """AST Visitor that finds TensorFlow Function calls. + + Updates function calls from old API version to new API version. + """ + + def __init__(self, filename, lines): + self._filename = filename + self._file_edit = FileEditRecorder(filename) + self._lines = lines + self._api_change_spec = APIChangeSpec() + + def process(self, lines): + return self._file_edit.process(lines) + + def generic_visit(self, node): + ast.NodeVisitor.generic_visit(self, node) + + def _rename_functions(self, node, full_name): + function_renames = self._api_change_spec.function_renames + try: + new_name = function_renames[full_name] + self._file_edit.add("Renamed function %r to %r" % (full_name, + new_name), + node.lineno, node.col_offset, full_name, new_name) + except KeyError: + pass + + def _get_attribute_full_path(self, node): + """Traverse an attribute to generate a full name e.g. tf.foo.bar. + + Args: + node: A Node of type Attribute. + + Returns: + a '.'-delimited full-name or None if the tree was not a simple form. + i.e. 
`foo()+b).bar` returns None, while `a.b.c` would return "a.b.c". + """ + curr = node + items = [] + while not isinstance(curr, ast.Name): + if not isinstance(curr, ast.Attribute): + return None + items.append(curr.attr) + curr = curr.value + items.append(curr.id) + return ".".join(reversed(items)) + + def _find_true_position(self, node): + """Return correct line number and column offset for a given node. + + This is necessary mainly because ListComp's location reporting reports + the next token after the list comprehension list opening. + + Args: + node: Node for which we wish to know the lineno and col_offset + """ + import re + find_open = re.compile("^\s*(\\[).*$") + find_string_chars = re.compile("['\"]") + + if isinstance(node, ast.ListComp): + # Strangely, ast.ListComp returns the col_offset of the first token + # after the '[' token which appears to be a bug. Workaround by + # explicitly finding the real start of the list comprehension. + line = node.lineno + col = node.col_offset + # loop over lines + while 1: + # Reverse the text to and regular expression search for whitespace + text = self._lines[line-1] + reversed_preceding_text = text[:col][::-1] + # First find if a [ can be found with only whitespace between it and + # col. + m = find_open.match(reversed_preceding_text) + if m: + new_col_offset = col - m.start(1) - 1 + return line, new_col_offset + else: + if (reversed_preceding_text=="" or + reversed_preceding_text.isspace()): + line = line - 1 + prev_line = self._lines[line - 1] + # TODO(aselle): + # this is poor comment detection, but it is good enough for + # cases where the comment does not contain string literal starting/ + # ending characters. If ast gave us start and end locations of the + # ast nodes rather than just start, we could use string literal + # node ranges to filter out spurious #'s that appear in string + # literals. + comment_start = prev_line.find("#") + if comment_start == -1: + col = len(prev_line) -1 + elif find_string_chars.search(prev_line[comment_start:]) is None: + col = comment_start + else: + return None, None + else: + return None, None + # Most other nodes return proper locations (with notably does not), but + # it is not possible to use that in an argument. + return node.lineno, node.col_offset + + + def visit_Call(self, node): # pylint: disable=invalid-name + """Handle visiting a call node in the AST. + + Args: + node: Current Node + """ + + + # Find a simple attribute name path e.g. "tf.foo.bar" + full_name = self._get_attribute_full_path(node.func) + + # Make sure the func is marked as being part of a call + node.func.is_function_for_call = True + + if full_name and full_name.startswith("tf."): + # Call special handlers + function_handles = self._api_change_spec.function_handle + if full_name in function_handles: + function_handles[full_name](self._file_edit, node) + + # Examine any non-keyword argument and make it into a keyword argument + # if reordering required. 
+ function_reorders = self._api_change_spec.function_reorders + function_keyword_renames = ( + self._api_change_spec.function_keyword_renames) + + if full_name in function_reorders: + reordered = function_reorders[full_name] + for idx, arg in enumerate(node.args): + lineno, col_offset = self._find_true_position(arg) + if lineno is None or col_offset is None: + self._file_edit.add( + "Failed to add keyword %r to reordered function %r" + % (reordered[idx], full_name), arg.lineno, arg.col_offset, + "", "", + error="A necessary keyword argument failed to be inserted.") + else: + keyword_arg = reordered[idx] + if (full_name in function_keyword_renames and + keyword_arg in function_keyword_renames[full_name]): + keyword_arg = function_keyword_renames[full_name][keyword_arg] + self._file_edit.add("Added keyword %r to reordered function %r" + % (reordered[idx], full_name), lineno, + col_offset, "", keyword_arg + "=") + + # Examine each keyword argument and convert it to the final renamed form + renamed_keywords = ({} if full_name not in function_keyword_renames else + function_keyword_renames[full_name]) + for keyword in node.keywords: + argkey = keyword.arg + argval = keyword.value + + if argkey in renamed_keywords: + argval_lineno, argval_col_offset = self._find_true_position(argval) + if (argval_lineno is not None and argval_col_offset is not None): + # TODO(aselle): We should scan backward to find the start of the + # keyword key. Unfortunately ast does not give you the location of + # keyword keys, so we are forced to infer it from the keyword arg + # value. + key_start = argval_col_offset - len(argkey) - 1 + key_end = key_start + len(argkey) + 1 + if self._lines[argval_lineno - 1][key_start:key_end] == argkey + "=": + self._file_edit.add("Renamed keyword argument from %r to %r" % + (argkey, renamed_keywords[argkey]), + argval_lineno, + argval_col_offset - len(argkey) - 1, + argkey + "=", renamed_keywords[argkey] + "=") + continue + self._file_edit.add( + "Failed to rename keyword argument from %r to %r" % + (argkey, renamed_keywords[argkey]), + argval.lineno, + argval.col_offset - len(argkey) - 1, + "", "", + error="Failed to find keyword lexographically. Fix manually.") + + ast.NodeVisitor.generic_visit(self, node) + + def visit_Attribute(self, node): # pylint: disable=invalid-name + """Handle bare Attributes i.e. [tf.foo, tf.bar]. + + Args: + node: Node that is of type ast.Attribute + """ + full_name = self._get_attribute_full_path(node) + if full_name and full_name.startswith("tf."): + self._rename_functions(node, full_name) + if full_name in self._api_change_spec.change_to_function: + if not hasattr(node, "is_function_for_call"): + new_text = full_name + "()" + self._file_edit.add("Changed %r to %r"%(full_name, new_text), + node.lineno, node.col_offset, full_name, new_text) + + ast.NodeVisitor.generic_visit(self, node) + + +class TensorFlowCodeUpgrader(object): + """Class that handles upgrading a set of Python files to TensorFlow 1.0.""" + + def __init__(self): + pass + + def process_file(self, in_filename, out_filename): + """Process the given python file for incompatible changes. + + Args: + in_filename: filename to parse + out_filename: output file to write to + Returns: + A tuple representing number of files processed, log of actions, errors + """ + + # Write to a temporary file, just in case we are doing an implace modify. 
+ with open(in_filename, "r") as in_file, \ + tempfile.NamedTemporaryFile("w", delete=False) as temp_file: + ret = self.process_opened_file( + in_filename, in_file, out_filename, temp_file) + + shutil.move(temp_file.name, out_filename) + return ret + + # Broad exceptions are required here because ast throws whatever it wants. + # pylint: disable=broad-except + def process_opened_file(self, in_filename, in_file, out_filename, out_file): + """Process the given python file for incompatible changes. + + This function is split out to facilitate StringIO testing from + tf_upgrade_test.py. + + Args: + in_filename: filename to parse + in_file: opened file (or StringIO) + out_filename: output file to write to + out_file: opened file (or StringIO) + Returns: + A tuple representing number of files processed, log of actions, errors + """ + process_errors = [] + text = "-" * 80 + "\n" + text += "Processing file %r\n outputting to %r\n" % (in_filename, + out_filename) + text += "-" * 80 + "\n\n" + + parsed_ast = None + lines = in_file.readlines() + try: + parsed_ast = ast.parse("".join(lines)) + except Exception: + text += "Failed to parse %r\n\n" % in_filename + text += traceback.format_exc() + if parsed_ast: + visitor = TensorFlowCallVisitor(in_filename, lines) + visitor.visit(parsed_ast) + out_text, new_text, process_errors = visitor.process(lines) + text += new_text + if out_file: + out_file.write(out_text) + text += "\n" + return 1, text, process_errors + # pylint: enable=broad-except + + def process_tree(self, root_directory, output_root_directory): + """Processes upgrades on an entire tree of python files in place. + + Note that only Python files. If you have custom code in other languages, + you will need to manually upgrade those. + + Args: + root_directory: Directory to walk and process. + output_root_directory: Directory to use as base + Returns: + A tuple of files processed, the report string ofr all files, and errors + """ + + # make sure output directory doesn't exist + if output_root_directory and os.path.exists(output_root_directory): + print("Output directory %r must not already exist." 
% ( + output_root_directory)) + sys.exit(1) + + # make sure output directory does not overlap with root_directory + norm_root = os.path.split(os.path.normpath(root_directory)) + norm_output = os.path.split(os.path.normpath(output_root_directory)) + if norm_root == norm_output: + print("Output directory %r same as input directory %r" % ( + root_directory, output_root_directory)) + sys.exit(1) + + # Collect list of files to process (we do this to correctly handle if the + # user puts the output directory in some sub directory of the input dir) + files_to_process = [] + for dir_name, _, file_list in os.walk(root_directory): + py_files = [f for f in file_list if f.endswith(".py")] + for filename in py_files: + fullpath = os.path.join(dir_name, filename) + fullpath_output = os.path.join( + output_root_directory, os.path.relpath(fullpath, root_directory)) + files_to_process.append((fullpath, fullpath_output)) + + file_count = 0 + tree_errors = [] + report = "" + report += ("=" * 80) + "\n" + report += "Input tree: %r\n" % root_directory + report += ("=" * 80) + "\n" + + for input_path, output_path in files_to_process: + output_directory = os.path.dirname(output_path) + if not os.path.isdir(output_directory): + os.makedirs(output_directory) + file_count += 1 + _, l_report, l_errors = self.process_file(input_path, output_path) + tree_errors += l_errors + report += l_report + return file_count, report, tree_errors + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Convert a TensorFlow Python file to 1.0 + +Simple usage: + tf_convert.py --infile foo.py --outfile bar.py + tf_convert.py --intree ~/code/old --outtree ~/code/new +""") + parser.add_argument( + "--infile", + dest="input_file", + help="If converting a single file, the name of the file " + "to convert") + parser.add_argument( + "--outfile", + dest="output_file", + help="If converting a single file, the output filename.") + parser.add_argument( + "--intree", + dest="input_tree", + help="If converting a whole tree of files, the directory " + "to read from (relative or absolute).") + parser.add_argument( + "--outtree", + dest="output_tree", + help="If converting a whole tree of files, the output " + "directory (relative or absolute).") + parser.add_argument( + "--reportfile", + dest="report_filename", + help=("The name of the file where the report log is " + "stored." 
+ "(default: %(default)s)"), + default="report.txt") + args = parser.parse_args() + + upgrade = TensorFlowCodeUpgrader() + report_text = None + report_filename = args.report_filename + files_processed = 0 + if args.input_file: + files_processed, report_text, errors = upgrade.process_file( + args.input_file, args.output_file) + files_processed = 1 + elif args.input_tree: + files_processed, report_text, errors = upgrade.process_tree( + args.input_tree, args.output_tree) + else: + parser.print_help() + if report_text: + open(report_filename, "w").write(report_text) + print("TensorFlow 1.0 Upgrade Script") + print("-----------------------------") + print("Converted %d files\n" % files_processed) + print("Detected %d errors that require attention" % len(errors)) + print("-" * 80) + print("\n".join(errors)) + print("\nMake sure to read the detailed log %r\n" % report_filename) diff --git a/synthetic/scripts/batch-bencmarks-gpu-gpu15.sh b/synthetic/scripts/batch-bencmarks-gpu-gpu15.sh index 856f57b..dd3a942 100755 --- a/synthetic/scripts/batch-bencmarks-gpu-gpu15.sh +++ b/synthetic/scripts/batch-bencmarks-gpu-gpu15.sh @@ -2,25 +2,25 @@ # The benchmarks of all toolkits # GPU-0 AlexNet -#minibatch=16 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh -#minibatch=32 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh -#minibatch=64 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh -#minibatch=128 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=16 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=32 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=64 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=128 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh # ## GPU-0 RetNet -#minibatch=8 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh -#minibatch=16 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh -#minibatch=32 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh -#minibatch=64 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=8 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=16 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=32 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=64 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh # # GPU-0 Fully Connected: FFN26752 minibatch=32 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh minibatch=64 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh minibatch=128 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh -#minibatch=256 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh -#minibatch=512 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh -#minibatch=1024 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=256 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=512 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=1024 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh ## GPU-0 Fully Connected: FFN26752 6 Hidden Layers #minibatch=256 
iterations=8 epochs=4 device_id=0 network_name=ffn26752l6 ./fc-benchmarks.sh diff --git a/synthetic/scripts/batch-bencmarks-gpu-gpu20.sh b/synthetic/scripts/batch-bencmarks-gpu-gpu20.sh new file mode 100755 index 0000000..dd3a942 --- /dev/null +++ b/synthetic/scripts/batch-bencmarks-gpu-gpu20.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# The benchmarks of all toolkits + +# GPU-0 AlexNet +minibatch=16 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=32 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=64 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +minibatch=128 iterations=8 epochs=4 device_id=0 network_name=alexnet ./cnn-benchmarks.sh +# +## GPU-0 RetNet +minibatch=8 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=16 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=32 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +minibatch=64 iterations=8 epochs=4 device_id=0 network_name=resnet ./cnn-benchmarks.sh +# + +# GPU-0 Fully Connected: FFN26752 +minibatch=32 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=64 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=128 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=256 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=512 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh +minibatch=1024 iterations=8 epochs=4 device_id=0 network_name=ffn26752 ./fc-benchmarks.sh + +## GPU-0 Fully Connected: FFN26752 6 Hidden Layers +#minibatch=256 iterations=8 epochs=4 device_id=0 network_name=ffn26752l6 ./fc-benchmarks.sh +#minibatch=512 iterations=8 epochs=4 device_id=0 network_name=ffn26752l6 ./fc-benchmarks.sh +#minibatch=1024 iterations=8 epochs=4 device_id=0 network_name=ffn26752l6 ./fc-benchmarks.sh +#minibatch=2048 iterations=8 epochs=4 device_id=0 network_name=ffn26752l6 ./fc-benchmarks.sh +#minibatch=4096 iterations=8 epochs=4 device_id=0 network_name=ffn26752l6 ./fc-benchmarks.sh diff --git a/synthetic/scripts/cnn-benchmarks.sh b/synthetic/scripts/cnn-benchmarks.sh index 13e1e8c..35d0ac5 100755 --- a/synthetic/scripts/cnn-benchmarks.sh +++ b/synthetic/scripts/cnn-benchmarks.sh @@ -4,8 +4,9 @@ ########### # CNN ########### -REPO_HOME=/home/comp/pengfeixu/dlbench/synthetic +#REPO_HOME=/home/comp/pengfeixu/dlbench/synthetic #REPO_HOME=/home/ipdps/dpBenchmark/synthetic +REPO_HOME=/home/comp/csshshi/repositories/dpBenchmark/synthetic current_path=$REPO_HOME/scripts experiments_path=$REPO_HOME/experiments log_path=$REPO_HOME/logs @@ -36,7 +37,7 @@ hostName=`hostname` #tools=( "caffe" "cntk" "tensorflow" "torch" ) -tools=( "mxnet" ) +tools=( "mxnet" "cntk") benchmark_logfile=${current_path}/${network_type}-${network_name}-gpu${device_id}.bm echo -e 'GPU:'${device_id}'\nNUM_THREADS (for CPU): '${OMP_NUM_THREADS}'\nNetwork: '${network_name}'\nEpochs: '${epochs}'\nMinibatch: '${minibatch}'\nIterations: '${iterations}'\nBenchmark Time: '${running_time}'\n_________________\n'>> ${benchmark_logfile} echo -e 'ToolName\t\t\tAverageTime(s)'>>${benchmark_logfile} diff --git a/synthetic/scripts/fc-benchmarks.sh b/synthetic/scripts/fc-benchmarks.sh index e18381f..348993e 100755 --- a/synthetic/scripts/fc-benchmarks.sh +++ b/synthetic/scripts/fc-benchmarks.sh @@ -4,8 +4,8 @@ ########### # CNN ########### 
-REPO_HOME=/home/ipdps/dpBenchmark/synthetic -#REPO_HOME=/home/comp/csshshi/repositories/dpBenchmark/synthetic +#REPO_HOME=/home/ipdps/dpBenchmark/synthetic +REPO_HOME=/home/comp/csshshi/repositories/dpBenchmark/synthetic current_path=$REPO_HOME/scripts experiments_path=$REPO_HOME/experiments log_path=$REPO_HOME/logs @@ -36,7 +36,7 @@ hostName=`hostname` #tools=( "caffe" "cntk" "dsstne" "tensorflow" "torch" ) #tools=( "caffe" "cntk" "tensorflow" "torch" ) #cpu versions, exclude dsstne -tools=( "cntk" ) +tools=( "cntk" "mxnet") benchmark_logfile=${current_path}/${network_type}-${network_name}-gpu${device_id}.bm echo -e 'GPU:'${device_id}'\nNUM_THREADS (for CPU): '${OMP_NUM_THREADS}'\nNetwork: '${network_name}'\nEpochs: '${epochs}'\nMinibatch: '${minibatch}'\nIterations: '${iterations}'\nBenchmark Time: '${running_time}'\n_________________\n'>> ${benchmark_logfile} echo -e 'ToolName\t\t\tAverageTime(s)'>>${benchmark_logfile} diff --git a/tools/caffe/caffebm.py b/tools/caffe/caffebm.py index 5826c59..c608d70 100644 --- a/tools/caffe/caffebm.py +++ b/tools/caffe/caffebm.py @@ -25,6 +25,7 @@ # Set system variable os.environ['OMP_NUM_THREADS'] = args.cpuCount os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount +os.environ['MKL_NUM_THREADS'] = args.cpuCount # Build cmd for benchmark root_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/tools/cntk/cnn/alexnet/alexnet_cifar10.cntk b/tools/cntk/cnn/alexnet/alexnet_cifar10.cntk index efd6162..55fee8e 100644 --- a/tools/cntk/cnn/alexnet/alexnet_cifar10.cntk +++ b/tools/cntk/cnn/alexnet/alexnet_cifar10.cntk @@ -1,7 +1,7 @@ WorkDir=. OutputDir = "$WorkDir$/Output" ModelDir = "$OutputDir$/Models" -DataDir = "/home/ipdps/data/cntk/cifar10" +DataDir = "/home/comp/csshshi/data/cntk/cifar10" #DataDir = "/home/comp/pengfeixu/Data/cntk/cifar10" precision=float diff --git a/tools/cntk/cnn/resnet/resnet.cntk b/tools/cntk/cnn/resnet/resnet.cntk index 6f6a451..2a00deb 100644 --- a/tools/cntk/cnn/resnet/resnet.cntk +++ b/tools/cntk/cnn/resnet/resnet.cntk @@ -1,7 +1,7 @@ RootDir = "." ConfigDir = "$RootDir$" -DataDir = "/home/ipdps/data/cntk/cifar10" +DataDir = "/home/comp/csshshi/data/cntk/cifar10" #DataDir = "/home/comp/pengfeixu/Data/cntk/cifar10" #DataDir = "/home/ipdps/Data/cntk/cifar10" OutputDir = "$RootDir$/Output" diff --git a/tools/cntk/cntkbm.py b/tools/cntk/cntkbm.py index c782685..e5f2422 100644 --- a/tools/cntk/cntkbm.py +++ b/tools/cntk/cntkbm.py @@ -26,6 +26,7 @@ # Set system variable os.environ['OMP_NUM_THREADS'] = args.cpuCount os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount +os.environ['MKL_NUM_THREADS'] = args.cpuCount # Build cmd for benchmark root_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/tools/cntk/fc/fcn5.cntk b/tools/cntk/fc/fcn5.cntk index db6a3c8..9246a37 100644 --- a/tools/cntk/fc/fcn5.cntk +++ b/tools/cntk/fc/fcn5.cntk @@ -1,7 +1,7 @@ WorkDir= "." ConfigDir= "." 
ModelDir=$WorkDir$/Output -DataDir=/home/ipdps/data/cntk/mnist +DataDir=/home/comp/csshshi/data/cntk/mnist #DataDir=/home/comp/pengfeixu/Data/cntk/mnist #ndlMacros = "$ConfigDir$/Macros.ndl" precision=float diff --git a/tools/cntk/multinodes/fc/Macros.ndl b/tools/cntk/multinodes/fc/Macros.ndl deleted file mode 100644 index f6e5ee0..0000000 --- a/tools/cntk/multinodes/fc/Macros.ndl +++ /dev/null @@ -1,35 +0,0 @@ -ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) -{ - convW = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) - # conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true) - conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout="cudnn") - convB = Parameter(outMap, 1, init = fixedValue, value = bValue) - convPlusB = Plus(conv, convB); - act = RectifiedLinear(convPlusB); -} - -#ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) -#[ -# W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) -# b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn") -# c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn") -# z = Plus(c, b); -# y = RectifiedLinear(z); -#] - -DNNReLULayer(inDim, outDim, x, wScale, bValue) -{ - W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale) - b = Parameter(outDim, init = fixedValue, value = bValue) - t = Times(W, x) - z = Plus(t, b) - y = RectifiedLinear(z) -} - -DNNLastLayer(hiddenDim, labelDim, x, wScale, bValue) -{ - W = Parameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale) - b = Parameter(labelDim, init = fixedValue, value = bValue) - t = Times(W, x) - z = Plus(t, b) -} diff --git a/tools/cntk/multinodes/fc/fc.sh b/tools/cntk/multinodes/fc/fc.sh deleted file mode 100644 index 1b61586..0000000 --- a/tools/cntk/multinodes/fc/fc.sh +++ /dev/null @@ -1,6 +0,0 @@ -start=`date +%s.%N` -cntk configFile=ffn26752.cntk configName=ffn >1GPU.log 2>&1 -end=`date +%s.%N` -runtime=$( echo "$end - $start" | bc -l ) -echo "finished with execute time: ${runtime}" >>1GPU.log - diff --git a/tools/cntk/multinodes/fc/fcn5.cntk b/tools/cntk/multinodes/fc/fcn5.cntk deleted file mode 100644 index 043ea80..0000000 --- a/tools/cntk/multinodes/fc/fcn5.cntk +++ /dev/null @@ -1,78 +0,0 @@ -WorkDir=. 
-ModelDir=$WorkDir$/Output/$ConfigName$ -#stderr=$WorkDir$/logs/$ConfigName$/out -DataDir=/home/dl/data/cntk -precision=float - -deviceId=0 -minibatchSize=1024 -epochSize=4096 -maxEpochs=4 - -makeMode=false - -command=Train - -featureDim = 26752 -labelDim = 26752 -hiddenDim = 2048 - -initOnCPUOnly=true -parallelTrain=true -prefetch=true - -Train=[ - action=train - modelPath=$ModelDir$/fc26752 - traceLevel=1 - - SimpleNetworkBuilder=[ - layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ - initOnCPUOnly=true - trainingCriterion=CrossEntropyWithSoftmax - evalCriterion=ErrorPrediction - layerTypes=Sigmoid - applyMeanVarNorm=false - initValueScale=1.0 - uniformInit=true - needPrior=false - ] - - SGD=[ - epochSize=$epochSize$ - minibatchSize=$minibatchSize$ - maxEpochs=$maxEpochs$ - learningRatesPerMB=0.01 - numMBsToShowResult=4 - momentumPerSample=0 - dropoutRate=0.0 - - ParallelTrain=[ - parallelizationMethod=DataParallelSGD - distributedMBReading=true - parallelizationStartEpoch=1 - DataParallelSGD=[ - gradientBits=1 - ] - ] - - gradUpdateType=None - normWithAveMultiplier=true - clippingThresholdPerSample=1#INF - ] -] - -reader=[ - readerType=UCIFastReader - file=$DataDir$/data26752_4k.txt - features=[ - dim=$featureDim$ - start=1 - ] - labels=[ - dim=1 - start=0 - labelDim=$labelDim$ - labelMappingFile=$DataDir$/labelmap26752.txt - ] -] diff --git a/tools/cntk/multinodes/fc/fcn8.cntk b/tools/cntk/multinodes/fc/fcn8.cntk deleted file mode 100644 index 069f877..0000000 --- a/tools/cntk/multinodes/fc/fcn8.cntk +++ /dev/null @@ -1,89 +0,0 @@ -WorkDir=. -ModelDir=$WorkDir$/Output/$ConfigName$ -#stderr=$WorkDir$/logs/$ConfigName$/out -DataDir=/home/dl/data/cntk -precision=float - -deviceId=0 -minibatchSize=1024 -epochSize=4096 -maxEpochs=2 - -makeMode=false - -command=Train - -featureDim = 26752 -labelDim = 26752 -hiddenDim = 2048 - -initOnCPUOnly=true -parallelTrain=false -prefetch=true - -Train=[ - action=train - modelPath=$ModelDir$/fc26752l6 - #deviceId=1 - traceLevel=1 - - SimpleNetworkBuilder=[ - #layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ - layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ - initOnCPUOnly=true - trainingCriterion=CrossEntropyWithSoftmax - evalCriterion=ErrorPrediction - layerTypes=Sigmoid - applyMeanVarNorm=false - initValueScale=1.0 - uniformInit=true - needPrior=false - ] - - SGD=[ - epochSize=$epochSize$ - minibatchSize=$minibatchSize$ - maxEpochs=$maxEpochs$ - learningRatesPerMB=0.01 - numMBsToShowResult=4 - momentumPerSample=0 - dropoutRate=0.0 - - #epochSize=4096 - #minibatchSize=256 - #maxEpochs=2 - #learningRatesPerMB=0.01 - #numMBsToShowResult=4 - #momentumPerSample=0 - #dropoutRate=0.0 - - ParallelTrain=[ - parallelizationMethod=DataParallelSGD - distributedMBReading=true - parallelizationStartEpoch=1 - DataParallelSGD=[ - gradientBits=1 - ] - ] - - gradUpdateType=None - normWithAveMultiplier=true - clippingThresholdPerSample=1#INF - ] -] - -reader=[ - readerType=UCIFastReader - file=$DataDir$/data26752_4k.txt - features=[ - dim=$featureDim$ - start=1 - ] - labels=[ - dim=1 - start=0 - labelDim=$labelDim$ - labelMappingFile=$DataDir$/labelmap26752.txt - #labelMappingFile=$WorkDir$/labelmap.txt - ] -] diff --git a/tools/cntk/multinodes/fc/ffn.cntk b/tools/cntk/multinodes/fc/ffn.cntk deleted file mode 100644 index 4f89735..0000000 --- a/tools/cntk/multinodes/fc/ffn.cntk +++ /dev/null @@ -1,87 +0,0 @@ -WorkDir=. 
-ModelDir=$WorkDir$/Output/$ConfigName$ -#stderr=$WorkDir$/logs/$ConfigName$/out -DataDir=/home/dl/data/cntk -precision=float - -makeMode=false - -command=Train - -deviceId=1 -minibatchSize=1024 -epochSize=262144 -maxEpochs=2 - -featureDim = 512 -labelDim = 1000 -hiddenDim = 2048 - -initOnCPUOnly=true -parallelTrain=false -prefetch=true - -Train=[ - action=train - modelPath=$ModelDir$/fc - traceLevel=1 - - SimpleNetworkBuilder=[ - #layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ - layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ - trainingCriterion=CrossEntropyWithSoftmax - evalCriterion=ErrorPrediction - layerTypes=Sigmoid - applyMeanVarNorm=false - initValueScale=1.0 - uniformInit=true - needPrior=false - ] - - SGD=[ - #epochSize=262144 - #minibatchSize=1024 - #learningRatesPerMB=0.01 - #numMBsToShowResult=4 - #momentumPerSample=0 - #dropoutRate=0.0 - #maxEpochs=2 - - epochSize=$epochSize$ - minibatchSize=$minibatchSize$ - maxEpochs=$maxEpochs$ - learningRatesPerMB=0.01 - numMBsToShowResult=4 - momentumPerSample=0 - dropoutRate=0.0 - - ParallelTrain=[ - parallelizationMethod=DataParallelSGD - distributedMBReading=true - parallelizationStartEpoch=1 - DataParallelSGD=[ - gradientBits=1 - ] - ] - - gradUpdateType=None - normWithAveMultiplier=true - clippingThresholdPerSample=1#INF - ] -] - -reader=[ - readerType=UCIFastReader - file=$DataDir$/data1000.txt - features=[ - dim=$featureDim$ - start=1 - ] - labels=[ - dim=1 - start=0 - labelDim=$labelDim$ - labelMappingFile=$DataDir$/labelmap.1K.txt - #labelMappingFile=$WorkDir$/labelmap.txt - ] -] diff --git a/tools/cntk/rnn/lstm/lstm.cntk b/tools/cntk/rnn/lstm/lstm.cntk index d70da11..1aaf20c 100644 --- a/tools/cntk/rnn/lstm/lstm.cntk +++ b/tools/cntk/rnn/lstm/lstm.cntk @@ -7,7 +7,7 @@ RootDir = "." ConfigDir = "$RootDir$" #DataDir = "/home/comp/pengfeixu/data/cntk/ptb" #DataDir = "/home/comp/csshshi/data/cntk/ptb" -DataDir = "/home/ipdps/data/cntk/ptb" +DataDir = "/home/comp/csshshi/data/cntk/ptb" OutputDir = "$RootDir$/Output" ModelDir = "$OutputDir$/Models" diff --git a/tools/mxnet/mxnetbm.py b/tools/mxnet/mxnetbm.py index 9bdd708..bb4ad90 100644 --- a/tools/mxnet/mxnetbm.py +++ b/tools/mxnet/mxnetbm.py @@ -28,6 +28,7 @@ # Set system variable os.environ['OMP_NUM_THREADS'] = args.cpuCount os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount +os.environ['MKL_NUM_THREADS'] = args.cpuCount # Build cmd exePath = "" diff --git a/tools/tensorflow/cnn/alexnet/alexnet_cifar10.py b/tools/tensorflow/cnn/alexnet/alexnet_cifar10.py index 203eaf8..9af79d1 100644 --- a/tools/tensorflow/cnn/alexnet/alexnet_cifar10.py +++ b/tools/tensorflow/cnn/alexnet/alexnet_cifar10.py @@ -134,11 +134,11 @@ def loss(logits, labels): batch_size = tf.size(labels) labels = tf.expand_dims(labels, 1) indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) - concated = tf.concat(1, [indices, labels]) + concated = tf.concat(axis=1, values=[indices, labels]) onehot_labels = tf.sparse_to_dense( - concated, tf.pack([batch_size, 10]), 1.0, 0.0) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, - onehot_labels, + concated, tf.stack([batch_size, 10]), 1.0, 0.0) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, + labels=onehot_labels, name='xentropy') loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') return loss @@ -216,10 +216,10 @@ def train(): grad = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss_value) # Create a saver. 
- saver = tf.train.Saver(tf.all_variables()) + saver = tf.train.Saver(tf.global_variables()) # Build an initialization operation. - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() # Start running operations on the Graph. sess.run(init) coord = tf.train.Coordinator() diff --git a/tools/tensorflow/cnn/alexnet/alexnet_cifar10_multi_gpu1.py b/tools/tensorflow/cnn/alexnet/alexnet_cifar10_multi_gpu1.py new file mode 100644 index 0000000..57a51bc --- /dev/null +++ b/tools/tensorflow/cnn/alexnet/alexnet_cifar10_multi_gpu1.py @@ -0,0 +1,328 @@ +from datetime import datetime + +import time +import cifar10_input +#import unpickle as cifar10_input + +import tensorflow as tf +import numpy as np +import os + +FLAGS = tf.app.flags.FLAGS + +parameters = [] +device_str = '' + +conv_counter = 1 +pool_counter = 1 +norm_counter = 1 +affine_counter = 1 +pad_counter = 1 + +FLAGS = tf.app.flags.FLAGS +# Basic model parameters. +tf.app.flags.DEFINE_integer('batch_size', 1024, """Number of images to process in a batch.""") +tf.app.flags.DEFINE_integer('epochs', 40, """Max epochs for training.""") +tf.app.flags.DEFINE_integer('log_step', 100, """Log step""") +tf.app.flags.DEFINE_integer('eval_step', 1, """Evaluate step of epoch""") +tf.app.flags.DEFINE_string('device_ids', '0,1', """Device ids. split by comma, e.g. 0,1""") +#tf.app.flags.DEFINE_string('data_dir', '/home/comp/csshshi/data/tensorflow/cifar10/cifar-10-batches-bin', """Data directory""") +tf.app.flags.DEFINE_string('data_dir', os.environ['HOME']+'/data/tensorflow/cifar10/cifar-10-batches-bin', """Data directory""") +#tf.app.flags.DEFINE_string('data_dir', '/home/comp/pengfeixu/Data/tensorflow/cifar10/cifar-10-batches-bin', """Data directory""") +tf.app.flags.DEFINE_string('train_dir', './trained_models/', + """Path to the data directory.""") +tf.app.flags.DEFINE_boolean('use_fp16', False, + """Train the model using fp16.""") +tf.app.flags.DEFINE_boolean('log_device_placement', True, + """Whether to log device placement.""") +tf.app.flags.DEFINE_integer('num_gpus', 2, """How many GPUs to use.""") + +EPOCH_SIZE = 50000 +TEST_SIZE = 10000 + + +def _init_global_variables(): + global conv_counter + global pool_counter + global norm_counter + global affine_counter + global pad_counter + conv_counter = 1 + pool_counter = 1 + norm_counter = 1 + affine_counter = 1 + pad_counter = 1 + + +def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType): + global conv_counter + global parameters + name = 'conv' + str(conv_counter) + conv_counter += 1 + with tf.variable_scope(name) as scope: + #kernel = tf.get_variable(name='weights', initializer=tf.random_normal([kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-2)) + kernel = tf.get_variable(name='weights', shape=[kH, kW, nIn, nOut], initializer=tf.truncated_normal_initializer(dtype=tf.float32, stddev=1e-2)) + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType) + #biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), + # trainable=True, name='biases') + biases = tf.get_variable(name='biases', initializer=tf.constant(0.0, shape=[nOut], dtype=tf.float32), dtype=tf.float32) + bias = tf.reshape(tf.nn.bias_add(conv, biases), + conv.get_shape()) + parameters += [kernel, biases] + return bias + + +def _relu(inpOp): + return tf.nn.relu(inpOp) + + +def _padding(inpOp, pad): + global pad_counter + name = 'pad' + str(pad_counter) + pad_counter += 1 + with tf.name_scope(name) as scope: + padded_input = tf.pad(inpOp, [[0, 0], [pad, pad], [pad, pad], [0, 
0]], "CONSTANT", name='pad') + print('padded_input: ', padded_input) + return padded_input + + +def _norm(inpOp, local_size, alpha, beta): + global norm_counter + name = 'norm' + str(norm_counter) + norm = tf.nn.lrn(inpOp, local_size, bias=1.0, alpha=alpha, beta=beta, name=name) + return norm + + +def _affine(inpOp, nIn, nOut): + global affine_counter + global parameters + name = 'affine' + str(affine_counter) + affine_counter += 1 + with tf.variable_scope(name) as scope: + #kernel = tf.get_variable(name='weights', initializer=tf.random_normal([nIn, nOut], + # dtype=tf.float32, + # stddev=1e-2)) + kernel = tf.get_variable(name='weights', shape=[nIn, nOut], initializer=tf.truncated_normal_initializer(dtype=tf.float32, + stddev=1e-2)) + biases = tf.get_variable(name='biases', shape=[nOut], initializer=tf.constant_initializer()) + affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) + parameters += [kernel, biases] + return affine1 + +def _mpool(inpOp, kH, kW, dH, dW): + global pool_counter + global parameters + name = 'pool' + str(pool_counter) + pool_counter += 1 + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool(inpOp, + ksize=ksize, + strides=strides, + padding='VALID', + name=name) + +def _avgpool(inpOp, kH, kW, dH, dW): + global pool_counter + name = 'pool' + str(pool_counter) + pool_counter += 1 + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.avg_pool(inpOp, + ksize=ksize, + strides=strides, + padding='VALID', + name=name) + +def loss_function(logits, labels): + batch_size = tf.size(labels) + labels = tf.expand_dims(labels, 1) + indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) + concated = tf.concat(axis=1, values=[indices, labels]) + onehot_labels = tf.sparse_to_dense( + concated, tf.stack([batch_size, 10]), 1.0, 0.0) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, + labels=onehot_labels, + name='xentropy') + loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') + return loss + +def inference(images): + pad1 = _padding(images, 2) + conv1 = _conv (pad1, 3, 32, 5, 5, 1, 1, 'VALID') + pool1 = _mpool(conv1, 3, 3, 2, 2) + relu1 = _relu(pool1) + #norm1 = _norm(relu1, 3, 5e-05, 0.75) + + pad2 = _padding(relu1, 2) + conv2 = _conv (pad2, 32, 32, 5, 5, 1, 1, 'VALID') + pool2 = _mpool(conv2, 3, 3, 2, 2) + relu2 = _relu(pool2) + #norm2 = _norm(relu2, 3, 5e-05, 0.75) + + pad3 = _padding(relu2, 2) + conv3 = _conv (pad3, 32, 64, 5, 5, 1, 1, 'VALID') + relu3 = _relu(conv3) + pool3 = _avgpool(relu3, 3, 3, 2, 2) + print('pool3: ', pool3) + + resh1 = tf.reshape(pool3, [-1, 64 * 3 * 3]) + affn1 = _affine(resh1, 64*3*3, 10) + + return affn1 + +def average_gradients(tower_grads): + """Calculate the average gradient for each shared variable across all towers. + + Note that this function provides a synchronization point across all towers. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over individual gradients. The inner list is over the gradient + calculation for each tower. + Returns: + List of pairs of (gradient, variable) where the gradient has been averaged + across all towers. + """ + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. + expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. 
+ grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + + +def train(): + global parameters + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement) + with tf.Graph().as_default(), tf.device("/cpu:0"): + global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) + + device_ids = FLAGS.device_ids.split(',') + print('device_ids: ', device_ids) + if len(device_ids) > FLAGS.num_gpus: + print('The device_ids should have the same number of GPUs with num_gpus') + return + + lr = 0.001 + optimizer = tf.train.GradientDescentOptimizer(lr) + #optimizer = tf.train.MomentumOptimizer(lr, 0.9) + + def assign_to_device(device, ps_device="/cpu:0"): + def _assign(op): + node_def = op if isinstance(op, tf.NodeDef) else op.node_def + if node_def.op == "Variable": + return ps_device + else: + return device + return _assign + + tower_grads = [] + average_loss_tensor = [] + for i in xrange(FLAGS.num_gpus): + print('what is i: ', i) + #with tf.device(assign_to_device('/gpu:%s'%device_ids[i])): + with tf.device('/gpu:%s'%device_ids[i]): + with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as n_scope: + _init_global_variables() + images, labels = cifar10_input.inputs(False, FLAGS.data_dir, FLAGS.batch_size) + logits = inference(images) + loss = loss_function(logits, labels) + + tf.add_to_collection('losses', loss) + tf.add_n(tf.get_collection('losses'), name='total_loss') + + losses = tf.get_collection('losses', n_scope) + total_loss = tf.add_n(losses, name='total_loss') + average_loss_tensor.append(total_loss) + + tf.get_variable_scope().reuse_variables() + print('total_loss: ', total_loss) + grads = optimizer.compute_gradients(total_loss) + print('grads: ', grads) + + tower_grads.append(grads) + + print('tower_grads: ', tower_grads) + print('len0: ', len(tower_grads[0])) + print('len1: ', len(tower_grads[1])) + + grads = average_gradients(tower_grads) + apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step) + train_op = apply_gradient_op + average_op = tf.reduce_mean(average_loss_tensor, 0) + + # Create a saver. 
+ saver = tf.train.Saver(tf.global_variables()) + + init = tf.global_variables_initializer() + sess = tf.Session(config=config) + sess.run(init) + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + real_batch_size = FLAGS.batch_size * FLAGS.num_gpus + num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1)/ real_batch_size) + iterations = FLAGS.epochs * num_batches_per_epoch + average_batch_time = 0.0 + epochs_info = [] + + step = 0 + average_loss = 0.0 + for step in xrange(iterations): + start_time = time.time() + #_, loss_v = sess.run([train_op, total_loss]) + _, loss_v = sess.run([train_op, average_op]) + duration = time.time() - start_time + average_batch_time += float(duration) + + assert not np.isnan(loss_v), 'Model diverged with loss = NaN' + average_loss += loss_v + + if step % FLAGS.log_step == 0: + examples_per_sec = real_batch_size / duration + sec_per_batch = float(duration) + format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)') + print (format_str % (datetime.now(), step, loss_v, examples_per_sec, sec_per_batch)) + + if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0: + average_loss /= num_batches_per_epoch * FLAGS.eval_step + print ('epoch: %d, loss: %.2f' % (step /num_batches_per_epoch, average_loss)) + epochs_info.append('%d:_:%s'%(step/(FLAGS.eval_step*num_batches_per_epoch), average_loss)) + average_loss = 0.0 + + checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') + saver.save(sess, checkpoint_path, global_step=step) + + coord.request_stop() + coord.join(threads) + + average_batch_time /= iterations + print 'average_batch_time: ', average_batch_time + print ('epoch_info: %s' % ','.join(epochs_info)) + + +def main(_): + train() + + +if __name__ == '__main__': + tf.app.run() diff --git a/tools/tensorflow/cnn/alexnet/report.txt b/tools/tensorflow/cnn/alexnet/report.txt new file mode 100644 index 0000000..1cfc903 --- /dev/null +++ b/tools/tensorflow/cnn/alexnet/report.txt @@ -0,0 +1,78 @@ +-------------------------------------------------------------------------------- +Processing file 'alexnet_cifar10_multi_gpu.py' + outputting to 'alexnet_cifar10_multi_gpu1.py' +-------------------------------------------------------------------------------- + +'alexnet_cifar10_multi_gpu.py' Line 201 +-------------------------------------------------------------------------------- + +Added keyword 'concat_dim' to reordered function 'tf.concat' +Added keyword 'values' to reordered function 'tf.concat' + + Old: grad = tf.concat(0, grads) + + New: grad = tf.concat(axis=0, values=grads) + ~~~~~ ~~~~~~~ + +'alexnet_cifar10_multi_gpu.py' Line 274 +-------------------------------------------------------------------------------- + +Renamed function 'tf.all_variables' to 'tf.global_variables' + + Old: saver = tf.train.Saver(tf.all_variables()) + ~~~~~~~~~~~~~~~~ + New: saver = tf.train.Saver(tf.global_variables()) + ~~~~~~~~~~~~~~~~~~~ + +'alexnet_cifar10_multi_gpu.py' Line 142 +-------------------------------------------------------------------------------- + +Added keyword 'concat_dim' to reordered function 'tf.concat' +Added keyword 'values' to reordered function 'tf.concat' + + Old: concated = tf.concat(1, [indices, labels]) + + New: concated = tf.concat(axis=1, values=[indices, labels]) + ~~~~~ ~~~~~~~ + +'alexnet_cifar10_multi_gpu.py' Line 144 +-------------------------------------------------------------------------------- + +Renamed function 'tf.pack' to 'tf.stack' + + Old: 
concated, tf.pack([batch_size, 10]), 1.0, 0.0) + ~~~~~~~ + New: concated, tf.stack([batch_size, 10]), 1.0, 0.0) + ~~~~~~~~ + +'alexnet_cifar10_multi_gpu.py' Line 145 +-------------------------------------------------------------------------------- + +Added keyword 'logits' to reordered function 'tf.nn.softmax_cross_entropy_with_logits' + + Old: cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, + + New: cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, + ~~~~~~~ + +'alexnet_cifar10_multi_gpu.py' Line 146 +-------------------------------------------------------------------------------- + +Added keyword 'labels' to reordered function 'tf.nn.softmax_cross_entropy_with_logits' + + Old: onehot_labels, + + New: labels=onehot_labels, + ~~~~~~~ + +'alexnet_cifar10_multi_gpu.py' Line 276 +-------------------------------------------------------------------------------- + +Renamed function 'tf.initialize_all_variables' to 'tf.global_variables_initializer' + + Old: init = tf.initialize_all_variables() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + New: init = tf.global_variables_initializer() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + diff --git a/tools/tensorflow/fc/fcn5_mnist.py b/tools/tensorflow/fc/fcn5_mnist.py index 535c80e..6e1fccb 100644 --- a/tools/tensorflow/fc/fcn5_mnist.py +++ b/tools/tensorflow/fc/fcn5_mnist.py @@ -1,5 +1,5 @@ import tensorflow as tf -import models +import models1 as models import time import os import numpy as np @@ -84,7 +84,7 @@ def train(model='fcn5'): #optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss) optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss) - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() sess.run(init) tf.train.start_queue_runners(sess=sess) batch_size_per_epoch = int((EPOCH_SIZE + FLAGS.batch_size - 1)/ FLAGS.batch_size) diff --git a/tools/tensorflow/fc/fcn5_mnist_multi_gpu1.py b/tools/tensorflow/fc/fcn5_mnist_multi_gpu1.py new file mode 100644 index 0000000..b555ada --- /dev/null +++ b/tools/tensorflow/fc/fcn5_mnist_multi_gpu1.py @@ -0,0 +1,228 @@ +import os +import tensorflow as tf +import models +import time +import numpy as np +from datetime import datetime +from tensorflow.examples.tutorials.mnist import input_data + + +FLAGS = tf.app.flags.FLAGS +# Basic model parameters. + +tf.app.flags.DEFINE_string('train_dir', './multigpu-trained', + """Directory where to write event logs """ + """and checkpoint.""") +tf.app.flags.DEFINE_integer('batch_size', 1024, """Number of images to process in a batch.""") +tf.app.flags.DEFINE_integer('epochs', 40, """Max epochs for training.""") +tf.app.flags.DEFINE_integer('log_step', 10, """Log step""") +tf.app.flags.DEFINE_integer('eval_step', 1, """Evaluate step of epoch""") +tf.app.flags.DEFINE_string('device_ids', '0,1', """Device ids. split by comma, e.g. 
0,1""") +#tf.app.flags.DEFINE_string('data_dir', '/home/comp/csshshi/data/tensorflow/MNIST_data/', +tf.app.flags.DEFINE_string('data_dir', os.environ['HOME']+'/data/tensorflow/MNIST_data/', +#tf.app.flags.DEFINE_string('data_dir', '/home/comp/pengfeixu/Data/tensorflow/MNIST_data/', + """Path to the data directory.""") +tf.app.flags.DEFINE_boolean('use_fp16', False, + """Train the model using fp16.""") +tf.app.flags.DEFINE_boolean('log_device_placement', True, + """Whether to log device placement.""") +tf.app.flags.DEFINE_integer('num_gpus', 2, """How many GPUs to use.""") + +EPOCH_SIZE = 60000 +TEST_SIZE = 10000 + + +def createFakeData(count, featureDim, labelDim): + features = np.random.randn(count, featureDim) + labels = np.random.randint(0, labelDim, size=(count, 1)) + return features, labels + +features, labels = createFakeData(1024, 32*32*3, 10) + + +def getFakeMinibatch(minibatchSize, labelDim): + feat = features[:minibatchSize] + l = labels[:minibatchSize] + lab = np.zeros((minibatchSize, labelDim)) + for i in range(lab.shape[0]): + lab[i][l[i]] = 1 + return feat, lab + +mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) + + +def get_real_batch_data(batch_size, label_dim): + batch_xs, batch_ys = mnist.train.next_batch(batch_size) + return batch_xs, batch_ys + + +def average_gradients(tower_grads): + """Calculate the average gradient for each shared variable across all towers. + + Note that this function provides a synchronization point across all towers. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over individual gradients. The inner list is over the gradient + calculation for each tower. + Returns: + List of pairs of (gradient, variable) where the gradient has been averaged + across all towers. + """ + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. + expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. + grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + + + + +def train(model='fcn5'): + if FLAGS.num_gpus < 2: + print("The number of GPU should be 2 or more, if you use one GPU, please use fcn5_mnist.py to train") + return + + config = tf.ConfigProto(allow_soft_placement=True,log_device_placement=FLAGS.log_device_placement) + + with tf.Graph().as_default(), tf.device("/cpu:0"): + global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) + + device_ids = FLAGS.device_ids.split(',') + if len(device_ids) > FLAGS.num_gpus: + print('The device_ids should have the same number of GPUs with num_gpus') + return + + lr = 0.05 + optimizer = tf.train.GradientDescentOptimizer(lr) + + # TF1.0 has error with this momentum optimizer, it should be fixed... 
+ #optimizer = tf.train.MomentumOptimizer(lr, 0.9) + + def assign_to_device(device, ps_device="/cpu:0"): + def _assign(op): + node_def = op if isinstance(op, tf.NodeDef) else op.node_def + if node_def.op == "Variable": + return ps_device + else: + return device + return _assign + + tower_grads = [] + feed_vars = [] + average_loss_tensor = [] + for i in xrange(FLAGS.num_gpus): + with tf.device(assign_to_device('/gpu:%s'%device_ids[i])): + with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope: + feature_dim = models.feature_dim + label_dim = models.label_dim + images = tf.placeholder(tf.float32, [None, feature_dim], name='images') + labels = tf.placeholder(tf.float32, [None, label_dim], name='labels') + feed_vars.append((images, labels)) + + logits = models.model_fcn5(images) + loss = models.loss(logits, labels) + tf.add_to_collection('losses', loss) + + #tf.add_n(tf.get_collection('losses'), name='total_loss') + losses = tf.get_collection('losses', scope) + total_loss = tf.add_n(losses, name='total_loss') + average_loss_tensor.append(total_loss) + + tf.get_variable_scope().reuse_variables() + grads = optimizer.compute_gradients(total_loss) + tower_grads.append(grads) + + print('tower_grads: ', tower_grads, '\nlen: ', len(tower_grads)) + print ('total_loss: ', total_loss) + + grads = average_gradients(tower_grads) + apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step) + + train_op = apply_gradient_op + average_op = tf.reduce_mean(average_loss_tensor, 0) + saver = tf.train.Saver(tf.global_variables()) + + init = tf.global_variables_initializer() + sess = tf.Session(config=config) + sess.run(init) + + # Add initialize for other variables + uninitialized_vars = [] + for var in tf.all_variables(): + try: + sess.run(var) + except tf.errors.FailedPreconditionError: + uninitialized_vars.append(var) + init_new_vars_op = tf.initialize_variables(uninitialized_vars) + sess.run(init_new_vars_op) + + tf.train.start_queue_runners(sess=sess) + + real_batch_size = FLAGS.batch_size * FLAGS.num_gpus + num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1)/ real_batch_size) + iterations = FLAGS.epochs * num_batches_per_epoch + average_batch_time = 0.0 + epochs_info = [] + + step = 0 + average_loss = 0.0 + for step in range(iterations): + start_time = time.time() + imgs, labs = get_real_batch_data(real_batch_size, 10) + feed_dict = {} + for i in range(FLAGS.num_gpus): + feed_dict[feed_vars[i][0]] = imgs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size] + feed_dict[feed_vars[i][1]] = labs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size] + # _, loss_value = sess.run([train_op, total_loss], feed_dict=feed_dict) + _, loss_value = sess.run([train_op, average_op], feed_dict=feed_dict) + duration = time.time() - start_time + average_batch_time += float(duration) + average_loss += loss_value + + assert not np.isnan(loss_value), 'Model diverged with loss = NaN' + + if step % FLAGS.log_step == 0: + examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration + sec_per_batch = float(duration) + format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)') + print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) + + if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0: + average_loss /= num_batches_per_epoch * FLAGS.eval_step + print ('epoch: %d, loss: %.2f' % (step/(FLAGS.eval_step*num_batches_per_epoch), average_loss)) + epochs_info.append('%d:-:%s'%(step/(FLAGS.eval_step*num_batches_per_epoch), average_loss)) + 
average_loss = 0.0 + + checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') + saver.save(sess, checkpoint_path, global_step=step) + + average_batch_time /= iterations + print 'average_batch_time: ', average_batch_time + print ('epoch_info: %s' % ','.join(epochs_info)) + + +def main(argv=None): + train(model='fcn5') + + +if __name__ == '__main__': + tf.app.run() diff --git a/tools/tensorflow/fc/models.py b/tools/tensorflow/fc/models.py index ed94bf3..93c4dfb 100644 --- a/tools/tensorflow/fc/models.py +++ b/tools/tensorflow/fc/models.py @@ -51,7 +51,7 @@ def model_fcn8(features): def loss(logits, labels): labels = tf.cast(labels, tf.float32) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, labels) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels) loss = tf.reduce_mean(cross_entropy, name='cross_entropy_mean') return loss diff --git a/tools/tensorflow/fc/report.txt b/tools/tensorflow/fc/report.txt new file mode 100644 index 0000000..df29a5e --- /dev/null +++ b/tools/tensorflow/fc/report.txt @@ -0,0 +1,37 @@ +-------------------------------------------------------------------------------- +Processing file 'fcn5_mnist_multi_gpu.py' + outputting to 'fcn5_mnist_multi_gpu1.py' +-------------------------------------------------------------------------------- + +'fcn5_mnist_multi_gpu.py' Line 160 +-------------------------------------------------------------------------------- + +Renamed function 'tf.all_variables' to 'tf.global_variables' + + Old: saver = tf.train.Saver(tf.all_variables()) + ~~~~~~~~~~~~~~~~ + New: saver = tf.train.Saver(tf.global_variables()) + ~~~~~~~~~~~~~~~~~~~ + +'fcn5_mnist_multi_gpu.py' Line 162 +-------------------------------------------------------------------------------- + +Renamed function 'tf.initialize_all_variables' to 'tf.global_variables_initializer' + + Old: init = tf.initialize_all_variables() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + New: init = tf.global_variables_initializer() + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +'fcn5_mnist_multi_gpu.py' Line 85 +-------------------------------------------------------------------------------- + +Added keyword 'concat_dim' to reordered function 'tf.concat' +Added keyword 'values' to reordered function 'tf.concat' + + Old: grad = tf.concat(0, grads) + + New: grad = tf.concat(axis=0, values=grads) + ~~~~~ ~~~~~~~ + + diff --git a/tools/tensorflow/tensorflowbm.py b/tools/tensorflow/tensorflowbm.py index 446d6ab..519099a 100644 --- a/tools/tensorflow/tensorflowbm.py +++ b/tools/tensorflow/tensorflowbm.py @@ -26,6 +26,7 @@ # Set system variable os.environ['OMP_NUM_THREADS'] = args.cpuCount os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount +os.environ['MKL_NUM_THREADS'] = args.cpuCount # Build cmd for benchmark root_path = os.path.dirname(os.path.abspath(__file__)) tool_path = root_path + "/" + args.netType diff --git a/tools/torch/torchbm.py b/tools/torch/torchbm.py index 3d4e848..ee88948 100644 --- a/tools/torch/torchbm.py +++ b/tools/torch/torchbm.py @@ -25,8 +25,9 @@ if args.debug: print("args: " + str(args)) # Set system variable -#os.environ['OMP_NUM_THREADS'] = args.cpuCount -#os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount +os.environ['OMP_NUM_THREADS'] = args.cpuCount +os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount +os.environ['MKL_NUM_THREADS'] = args.cpuCount # Build cmd cmd = "THC_CACHING_ALLOCATOR=1 th Main.lua " @@ -46,7 +47,7 @@ if "-" not in args.devId: cmd = "THC_CACHING_ALLOCATOR=1 CUDA_VISIBLE_DEVICES=" + args.devId + " th 
rnn/recurrent-language-model.lua --cuda " else: - cmd = "OMP_NUM_THREADS=%s OPENBLAS_NUM_THREADS=%s th rnn/recurrent-language-model.lua --lstm --startlr 1 " % (args.cpuCount, args.cpuCount) + cmd = "OMP_NUM_THREADS=%s OPENBLAS_NUM_THREADS=%s MKL_NUM_THREADS=%s th rnn/recurrent-language-model.lua --lstm --startlr 1 " % (args.cpuCount, args.cpuCount, args.cpuCount) else: print("Device not set, please set device by adding -devId <-1 or 0,1,2,3>. See help for more") sys.exit(-2)
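
For reference, the tool wrappers touched above (caffebm.py, cntkbm.py, mxnetbm.py, tensorflowbm.py, torchbm.py) all pin the CPU thread count the same way: the requested count is exported through OMP_NUM_THREADS, OPENBLAS_NUM_THREADS and, with this patch, MKL_NUM_THREADS before the benchmark command is built. A minimal standalone sketch of that pattern follows; the argument handling and the final print are illustrative assumptions, not code taken from the patch.

    # Minimal sketch of the shared thread-pinning pattern (assumed names).
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('-cpuCount', type=str, default='1',
                        help='CPU thread count, kept as a string as in the wrappers')
    args = parser.parse_args()

    # OpenMP, OpenBLAS and MKL each read their own variable, so all three are set.
    os.environ['OMP_NUM_THREADS'] = args.cpuCount
    os.environ['OPENBLAS_NUM_THREADS'] = args.cpuCount
    os.environ['MKL_NUM_THREADS'] = args.cpuCount

    # The real wrappers then assemble a tool-specific command line and run it;
    # a placeholder message stands in for that step here.
    print('thread environment prepared for %s threads' % args.cpuCount)

The report.txt files added above record the mechanical TensorFlow 0.x to 1.0 renames produced by tf_upgrade.py: tf.concat argument reordering, tf.pack to tf.stack, keyword arguments for tf.nn.softmax_cross_entropy_with_logits, tf.all_variables to tf.global_variables, and tf.initialize_all_variables to tf.global_variables_initializer. The snippet below collects the upgraded forms in one self-contained sketch; the tensor values and shapes are illustrative and not taken from the benchmark code.

    # TF 1.0-style calls corresponding to the renames listed in report.txt.
    import tensorflow as tf

    labels = tf.constant([3, 1, 4], dtype=tf.int32)     # illustrative labels
    logits = tf.random_normal([3, 10])                  # illustrative logits
    w = tf.Variable(tf.zeros([10]), name='w')           # a variable, so the saver has something to save

    batch_size = tf.size(labels)
    labels = tf.expand_dims(labels, 1)
    indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)

    concated = tf.concat(axis=1, values=[indices, labels])          # was tf.concat(1, [...])
    onehot_labels = tf.sparse_to_dense(
        concated, tf.stack([batch_size, 10]), 1.0, 0.0)             # was tf.pack([...])
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=onehot_labels, name='xentropy')       # keyword arguments now required
    loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')

    saver = tf.train.Saver(tf.global_variables())       # was tf.all_variables()
    init = tf.global_variables_initializer()            # was tf.initialize_all_variables()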