From e648370f23719abd2b398621ba3ffc9457a55955 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Tue, 11 Feb 2020 14:37:02 -0800
Subject: [PATCH 1/4] Fixed ordering, added warmup & runs to argparse and individual benchmark function calls

---
 benchmark/opperf/opperf.py                | 55 ++++++++++++++---------
 benchmark/opperf/utils/benchmark_utils.py |  4 +-
 benchmark/opperf/utils/common_utils.py    | 11 ++++-
 benchmark/opperf/utils/profiler_utils.py  | 15 +++++--
 4 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py
index 5f4c8ee9cf0e..dc71190e8659 100755
--- a/benchmark/opperf/opperf.py
+++ b/benchmark/opperf/opperf.py
@@ -50,7 +50,7 @@
     get_current_runtime_features
 
 
-def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native'):
+def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Run all the MXNet operators (NDArray) benchmarks.
 
     Returns
@@ -62,61 +62,61 @@ def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n
     # *************************MXNET TENSOR OPERATOR BENCHMARKS*****************************
 
     # Run all Unary operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks with default input values
     mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx,
-                                                                                         dtype=dtype, profiler=profiler))
+                                                                                         dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx,
-                                                                                            dtype=dtype, profiler=profiler))
+                                                                                            dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     mxnet_operator_benchmark_results.append(run_mx_binary_misc_operators_benchmarks(ctx=ctx,
-                                                                                    dtype=dtype, profiler=profiler))
+                                                                                    dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all GEMM operations benchmarks with default input values
     mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx,
-                                                                          dtype=dtype, profiler=profiler))
+                                                                          dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Random sampling operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Reduction operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Sorting and Searching operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Array Rearrange operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Indexing routines benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # ************************ MXNET NN OPERATOR BENCHMARKS ****************************
 
     # Run all basic NN operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Activation operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Pooling operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Convolution operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Optimizer operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Transpose Convolution operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all NN loss operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Linear Algebra operations benchmarks with default input values
-    mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler))
+    mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # ****************************** PREPARE FINAL RESULTS ********************************
     final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results)
@@ -159,6 +159,14 @@ def main():
                             'time module.'
                             'Valid Inputs - native, python')
 
+    parser.add_argument('-w', '--warmup', type=int, default=25,
+                        help='Number of times to run for warmup. '
+                             'Valid Inputs - positive integers')
+
+    parser.add_argument('-r', '--runs', type=int, default=100,
+                        help='Number of runs to capture benchmark results. '
+                             'Valid Inputs - positive integers')
+
     args = parser.parse_args()
     logging.info("Running MXNet operator benchmarks with the following options: {args}".format(args=args))
     assert not os.path.isfile(args.output_file),\
@@ -168,7 +176,14 @@ def main():
     ctx = _parse_mxnet_context(args.ctx)
     dtype = args.dtype
     profiler = args.profiler
-    final_benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler)
+    warmup = args.warmup
+    runs = args.runs
+    benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)
+
+    # Sort benchmark results alphabetically by op name
+    final_benchmark_results = dict()
+    for key in sorted(benchmark_results.keys()):
+        final_benchmark_results[key] = benchmark_results[key]
 
     # 3. PREPARE OUTPUTS
     run_time_features = get_current_runtime_features()
diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
index 421fe314267e..da7e2b8910aa 100644
--- a/benchmark/opperf/utils/benchmark_utils.py
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -80,14 +80,14 @@ def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, ar
             _, profiler_output = benchmark_helper_func(op, runs, [], **kwargs)
 
             # Add inputs used for profiling this operator into result
-            profiler_output["inputs"] = inputs[idx]
+            profiler_output = merge_map_list([{"inputs": inputs[idx]}] + [profiler_output])
             op_benchmark_result[op.__name__].append(profiler_output)
     else:
         for idx, (args, kwargs) in enumerate(zip(args_list, kwargs_list)):
             _, profiler_output = benchmark_helper_func(op, runs, args, **kwargs)
 
             # Add inputs used for profiling this operator into result
-            profiler_output["inputs"] = inputs[idx]
+            profiler_output = merge_map_list([{"inputs": inputs[idx]}] + [profiler_output])
             op_benchmark_result[op.__name__].append(profiler_output)
     logging.info("Complete Benchmark - {name}".format(name=op.__name__))
     return op_benchmark_result
diff --git a/benchmark/opperf/utils/common_utils.py b/benchmark/opperf/utils/common_utils.py
index 924d2fa9a23b..3eb8254e1132 100644
--- a/benchmark/opperf/utils/common_utils.py
+++ b/benchmark/opperf/utils/common_utils.py
@@ -41,7 +41,14 @@ def merge_map_list(map_list):
     map where all individual maps in the into map_list are merged
 
     """
-    return dict(ChainMap(*map_list))
+    # Preserve order of underlying maps and keys when converting to a single map
+    final_map = dict()
+
+    for current_map in map_list:
+        for key in current_map:
+            final_map[key] = current_map[key]
+
+    return final_map
 
 
 def save_to_file(inp_dict, out_filepath, out_format='json', runtime_features=None, profiler='native'):
@@ -65,7 +72,7 @@ def save_to_file(inp_dict, out_filepath, out_format='json', runtime_features=Non
     if out_format == 'json':
         # Save as JSON
         with open(out_filepath, "w") as result_file:
-            json.dump(inp_dict, result_file, indent=4, sort_keys=True)
+            json.dump(inp_dict, result_file, indent=4, sort_keys=False)
     elif out_format == 'md':
         # Save as md
         with open(out_filepath, "w") as result_file:
diff --git a/benchmark/opperf/utils/profiler_utils.py b/benchmark/opperf/utils/profiler_utils.py
index 45322c1066cf..874d27f069fb 100644
--- a/benchmark/opperf/utils/profiler_utils.py
+++ b/benchmark/opperf/utils/profiler_utils.py
@@ -58,14 +58,24 @@ def _get_operator_profile(operator_name, operator_profile_results):
     else:
         op_name = operator_name
 
+    # Variables to store forward/backward performance results
+    forward_res, backward_res = None, None
+
     for line in operator_profile_results:
         if op_name in line or op_name[:3] + " " in line:
             operation = line.split()[0]
             operation_avg_time = float(line.split()[-1])
             if "_backward" in operation:
-                operator_profile["avg_time_backward_" + operator_name] = operation_avg_time
+                backward_res = operation_avg_time
             else:
-                operator_profile["avg_time_forward_" + operator_name] = operation_avg_time
+                forward_res = operation_avg_time
+
+    # Add forward and backward performance results to the dict in the correct order
+    if forward_res is not None:
+        operator_profile["avg_time_forward_" + operator_name] = forward_res
+
+    if backward_res is not None:
+        operator_profile["avg_time_backward_" + operator_name] = backward_res
 
     return operator_profile
 
@@ -149,7 +159,6 @@ def parse_profiler_dump(operator_name, profiler_dump):
 
     # Prepare results
     memory_profile = _get_memory_profile(memory_profile_results)
     operator_profile = _get_operator_profile(operator_name, operator_profile_results)
-
     return merge_map_list([memory_profile, operator_profile])
 

From 5a6265323b1ca7405aeaa2344bd58cb62c56975a Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Tue, 11 Feb 2020 15:04:56 -0800
Subject: [PATCH 2/4] Dropped unused ChainMap

---
 benchmark/opperf/utils/common_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmark/opperf/utils/common_utils.py b/benchmark/opperf/utils/common_utils.py
index 3eb8254e1132..9af135eed34e 100644
--- a/benchmark/opperf/utils/common_utils.py
+++ b/benchmark/opperf/utils/common_utils.py
@@ -19,8 +19,6 @@
 import json
 from operator import itemgetter
 
-from collections import ChainMap
-
 import logging
 logging.basicConfig(level=logging.INFO)
 

From 319775162c989717b95ff059decc2fdf7457c987 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Tue, 11 Feb 2020 15:31:20 -0800
Subject: [PATCH 3/4] Added newline for consistency with previous changes

---
 benchmark/opperf/utils/profiler_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmark/opperf/utils/profiler_utils.py b/benchmark/opperf/utils/profiler_utils.py
index 874d27f069fb..fa959bf5a8b1 100644
--- a/benchmark/opperf/utils/profiler_utils.py
+++ b/benchmark/opperf/utils/profiler_utils.py
@@ -159,6 +159,7 @@ def parse_profiler_dump(operator_name, profiler_dump):
 
     # Prepare results
     memory_profile = _get_memory_profile(memory_profile_results)
     operator_profile = _get_operator_profile(operator_name, operator_profile_results)
+
     return merge_map_list([memory_profile, operator_profile])

From 0bce87975bb2dd575a2292656fad8276d01822e0 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Tue, 11 Feb 2020 16:57:42 -0800
Subject: [PATCH 4/4] Adjusted markdown output ordering

---
 benchmark/opperf/utils/common_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/opperf/utils/common_utils.py b/benchmark/opperf/utils/common_utils.py
index 9af135eed34e..fcf52d4377dd 100644
--- a/benchmark/opperf/utils/common_utils.py
+++ b/benchmark/opperf/utils/common_utils.py
@@ -127,7 +127,7 @@ def _prepare_op_benchmark_result(op, op_bench_result, profiler):
     result = ""
     if profiler == "native":
         result = "| {} | {} | {} | {} | {} |".format(operator_name,
-                                                     avg_forward_time, avg_backward_time, max_mem_usage, inputs)
+                                                     inputs, max_mem_usage, avg_forward_time, avg_backward_time)
     elif profiler == "python":
         result = "| {} | {} | {} | {} | {} | {} |".format(operator_name, avg_time, p50_time, p90_time, p99_time, inputs)
     return result
@@ -144,8 +144,8 @@ def _prepare_markdown(results, runtime_features=None, profiler='native'):
         results_markdown.append("# Benchmark Results")
     if profiler == 'native':
         results_markdown.append(
-            "| Operator | Avg Forward Time (ms) | Avg. Backward Time (ms) | Max Mem Usage (Storage) (Bytes)"
-            " | Inputs |")
+            "| Operator | Inputs | Max Mem Usage (Storage) (Bytes) | Avg Forward Time (ms)"
+            " | Avg. Backward Time (ms) |")
         results_markdown.append("| :---: | :---: | :---: | :---: | :---: |")
     elif profiler == 'python':
         results_markdown.append(
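Note: a minimal sketch of driving the full suite with the new warmup/runs knobs from Python, using only names that appear in the patches above (assumes an MXNet install and the repo root on PYTHONPATH):

import mxnet as mx

from benchmark.opperf.opperf import run_all_mxnet_operator_benchmarks

# Run every operator benchmark on CPU, overriding the new defaults
# (warmup=25, runs=100) introduced in patch 1/4.
results = run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32',
                                            profiler='native', warmup=10, runs=50)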
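The alphabetical sort added to main() in patch 1/4 relies on Python 3.7+ dicts preserving insertion order; an equivalent, terser form of the same loop would be:

# sorted() yields the (key, value) pairs in alphabetical key order,
# and dict() preserves that insertion order (Python 3.7+).
final_benchmark_results = dict(sorted(benchmark_results.items()))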
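For context on why merge_map_list dropped dict(ChainMap(*map_list)) (patches 1/4 and 2/4): ChainMap does not iterate its underlying maps first-to-last (CPython 3.7+ walks them in reverse), so the merged dict surfaced later maps' keys first, while the explicit update loop keeps first-to-last order. A sketch with hypothetical keys:

from collections import ChainMap

# Hypothetical entries; the real maps hold per-operator profiler output.
m1 = {"avg_time_forward_Convolution": 1.2}
m2 = {"avg_time_backward_Convolution": 3.4}

# Old behaviour on CPython 3.7+: keys of the last map come first.
print(list(dict(ChainMap(m1, m2))))  # ['avg_time_backward_Convolution', 'avg_time_forward_Convolution']

# New merge_map_list behaviour: first-to-last map order is preserved.
merged = {}
for m in (m1, m2):
    merged.update(m)
print(list(merged))  # ['avg_time_forward_Convolution', 'avg_time_backward_Convolution']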