merge from upstream

apache · Aug 23, 2019 · 1da8878 · 1da8878
2 parents 299010b + 9023256
commit 1da8878
Show file tree

Hide file tree

Showing 114 changed files with 4,732 additions and 1,651 deletions.
diff --git a/benchmark/opperf/README.md b/benchmark/opperf/README.md
@@ -47,7 +47,10 @@ Hence, in this utility, we will build the functionality to allow users and devel
 
 ## Prerequisites
 
-Make sure to build the flavor of MXNet, for example - with/without MKL, with CUDA 9 or 10.1 etc., on which you would like to measure operator performance. Finally, you need to add path to your cloned MXNet repository to the PYTHONPATH.
+Provided you have MXNet installed (any version >= 1.5.1), all you need to do to use the opperf utility is add the path to your cloned MXNet repository to the PYTHONPATH.
+
+Note:
+To install MXNet, refer to the [Installing MXNet page](https://mxnet.incubator.apache.org/versions/master/install/index.html).
 
 ```
 export PYTHONPATH=$PYTHONPATH:/path/to/incubator-mxnet/
@@ -75,7 +78,7 @@ For example, you want to run benchmarks for all NDArray Broadcast Binary Operato
 
 ```
 #!/usr/bin/python
-from benchmark.opperf.nd_operations.binary_broadcast_operators import run_mx_binary_broadcast_operators_benchmarks
+from benchmark.opperf.nd_operations.binary_operators import run_mx_binary_broadcast_operators_benchmarks
 
 # Run all Binary Broadcast operations benchmarks with default input values
 print(run_mx_binary_broadcast_operators_benchmarks())
@@ -136,7 +139,7 @@ from mxnet import nd
 
 from benchmark.opperf.utils.benchmark_utils import run_performance_test
 
-add_res = run_performance_test([nd.add, nd.sub], run_backward=True, dtype='float32', ctx=mx.cpu(),
+add_res = run_performance_test([nd.add, nd.subtract], run_backward=True, dtype='float32', ctx=mx.cpu(),
                                inputs=[{"lhs": (1024, 1024),
                                         "rhs": (1024, 1024)}],
                                warmup=10, runs=25)

diff --git a/benchmark/opperf/nd_operations/array_rearrange.py b/benchmark/opperf/nd_operations/array_rearrange.py
@@ -29,7 +29,7 @@
 """
 
 
-def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype) for all the
     rearrange operators  in MXNet.
 
@@ -53,5 +53,5 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25,
     mx_rearrange_ops = get_all_rearrange_operators()
 
     # Run benchmarks
-    mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, warmup, runs)
+    mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, warmup, runs)
     return mx_rearrange_op_results
diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py
@@ -38,7 +38,7 @@
     get_all_elemen_wise_binary_operators
 
 
-def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype) for all the binary
     broadcast operators in MXNet.
 
@@ -61,11 +61,11 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
     # Fetch all Binary Broadcast Operators
     mx_binary_broadcast_ops = get_all_broadcast_binary_operators()
     # Run benchmarks
-    mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, warmup, runs)
+    mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, warmup, runs)
     return mx_binary_op_results
 
 
-def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype) for all the binary
     element_wise operators in MXNet.
 
@@ -88,5 +88,5 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32
     # Fetch all Binary Element_wise Operators
     mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators()
     # Run benchmarks
-    mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, warmup, runs)
+    mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, warmup, runs)
     return mx_binary_op_results
diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py
@@ -34,7 +34,7 @@
 """
 
 
-def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the GEMM
     operators (dot, batch_dot) in MXNet.
 
@@ -67,7 +67,7 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs
                  "rhs": (100, 1000),
                  "transpose_a": True,
                  "transpose_b": True}],
-        warmup=warmup, runs=runs)
+        warmup=warmup, runs=runs, profiler=profiler)
 
     batch_dot_benchmark_res = run_performance_test(
         [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True,
@@ -81,7 +81,7 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs
                  "rhs": (32, 100, 1000),
                  "transpose_a": True,
                  "transpose_b": True}],
-        warmup=warmup, runs=runs)
+        warmup=warmup, runs=runs, profiler=profiler)
 
     # Prepare combined results for GEMM operators
     mx_gemm_op_results = merge_map_list(dot_benchmark_res + batch_dot_benchmark_res)

diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py
@@ -35,7 +35,7 @@
 """
 
 
-def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the activation
     operators (relu, sigmoid, softmax) in MXNet.
 
@@ -60,6 +60,7 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25
                                               run_backward=True,
                                               dtype=dtype,
                                               ctx=ctx,
+                                              profiler=profiler,
                                               inputs=[{"data": (1024, 1024), "act_type": "leaky", "slope": 0.1},
                                                       {"data": (10000, 1), "act_type": "leaky", "slope": 0.1},
                                                       {"data": (10000, 100), "act_type": "leaky", "slope": 0.1},
@@ -82,6 +83,7 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25
                                                       run_backward=True,
                                                       dtype=dtype,
                                                       ctx=ctx,
+                                                      profiler=profiler,
                                                       inputs=[{"data": (1024, 1024), "alpha": 0.25, "beta": 0.5},
                                                               {"data": (10000, 1), "alpha": 0.25, "beta": 0.5},
                                                               {"data": (10000, 100), "alpha": 0.25, "beta": 0.5}
@@ -95,6 +97,7 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25
                                                  run_backward=True,
                                                  dtype=dtype,
                                                  ctx=ctx,
+                                                 profiler=profiler,
                                                  inputs=[{"data": (1024, 1024), "axis": -1, "temperature": 0.5},
                                                          {"data": (10000, 1), "axis": -1, "temperature": 0.5},
                                                          {"data": (10000, 100), "axis": -1, "temperature": 0.5}

diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py
@@ -29,12 +29,13 @@
 """
 
 
-def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     # FullyConnnected operator benchmarks
     fc_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "FullyConnected")],
                                             run_backward=True,
                                             dtype=dtype,
                                             ctx=ctx,
+                                            profiler=profiler,
                                             inputs=[{"data": (32, 3, 256, 256),
                                                      "num_hidden": 64,
                                                      "weight": (64, 3 * 256 * 256),
@@ -53,6 +54,7 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25,
                                                  run_backward=True,
                                                  dtype=dtype,
                                                  ctx=ctx,
+                                                 profiler=profiler,
                                                  inputs=[{"data": (32, 3, 256, 256),
                                                           "p": 0.5,
                                                           "mode": "always"},
@@ -66,6 +68,7 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25,
                                                    run_backward=True,
                                                    dtype=dtype,
                                                    ctx=ctx,
+                                                   profiler=profiler,
                                                    inputs=[{"data": (32, 3, 256, 256),
                                                             "gamma": (3,),
                                                             "beta": (3,),

diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py
@@ -51,7 +51,7 @@
 """
 
 
-def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     pool_types = ['avg', 'max', 'sum']
     global_pool_types = [0, 1]
 
@@ -65,6 +65,7 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, r
                                                              run_backward=True,
                                                              dtype=dtype,
                                                              ctx=ctx,
+                                                             profiler=profiler,
                                                              inputs=[{"data": pool1d_data,
                                                                       "kernel": 3,
                                                                       "pool_type": pool_type,
@@ -79,6 +80,7 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, r
                                                              run_backward=True,
                                                              dtype=dtype,
                                                              ctx=ctx,
+                                                             profiler=profiler,
                                                              inputs=[{"data": pool2d_data,
                                                                       "kernel": (3, 3),
                                                                       "pool_type": pool_type,
@@ -93,14 +95,15 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, r
     return mx_pooling_op_results
 
 
-def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     # Conv1D Benchmarks
     conv1d_benchmark_res = []
     for conv_data in [(32, 3, 256), (32, 3, 64)]:
         conv1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")],
                                                      run_backward=True,
                                                      dtype=dtype,
                                                      ctx=ctx,
+                                                     profiler=profiler,
                                                      inputs=[{"data": conv_data,
                                                               "weight": (64, 3, 3),
                                                               "bias": (64,),
@@ -120,6 +123,7 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=2
                                                      run_backward=True,
                                                      dtype=dtype,
                                                      ctx=ctx,
+                                                     profiler=profiler,
                                                      inputs=[{"data": conv_data,
                                                               "weight": (64, 3, 3, 3),
                                                               "bias": (64,),
@@ -137,14 +141,15 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=2
     return mx_conv_op_results
 
 
-def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
+def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', dtype='float32', warmup=10, runs=50):
     # Conv1DTranspose Benchmarks
     conv1d_transpose_benchmark_res = []
     for conv_data in [(32, 3, 256), (32, 3, 64)]:
         conv1d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")],
                                                                run_backward=True,
                                                                dtype=dtype,
                                                                ctx=ctx,
+                                                               profiler=profiler,
                                                                inputs=[{"data": conv_data,
                                                                         "weight": (3, 64, 3),
                                                                         "bias": (64,),
@@ -166,6 +171,7 @@ def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32'
                                                                run_backward=True,
                                                                dtype=dtype,
                                                                ctx=ctx,
+                                                               profiler=profiler,
                                                                inputs=[{"data": conv_data,
                                                                         "weight": (3, 64, 3, 3),
                                                                         "bias": (64,),

diff --git a/benchmark/opperf/nd_operations/nn_optimizer_operators.py b/benchmark/opperf/nd_operations/nn_optimizer_operators.py
@@ -36,7 +36,7 @@
 """
 
 
-def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype) for all the neural network
     optimizer update operators in MXNet.
 
@@ -60,5 +60,5 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25,
     mx_optimizer_ops = get_all_optimizer_operators()
 
     # Run benchmarks
-    mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, warmup, runs)
+    mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs)
     return mx_optimizer_op_results
diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py
@@ -34,7 +34,7 @@
 from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators
 
 
-def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the random sampling
     operators in MXNet.
 
@@ -57,5 +57,5 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', w
     # Fetch all Random Sampling Operators
     mx_random_sample_ops = get_all_random_sampling_operators()
     # Run benchmarks
-    mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, warmup, runs)
+    mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, warmup, runs)
     return mx_random_sample_op_results
diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py
@@ -31,7 +31,7 @@
 from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks
 
 
-def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the reduction
     operators in MXNet.
 
@@ -54,5 +54,5 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=
     # Fetch all Reduction Operators
     mx_reduction_broadcast_ops = get_all_reduction_operators()
     # Run benchmarks
-    mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, warmup, runs)
+    mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, warmup, runs)
     return mx_reduction_op_results
diff --git a/benchmark/opperf/nd_operations/sorting_searching_operators.py b/benchmark/opperf/nd_operations/sorting_searching_operators.py
@@ -29,7 +29,7 @@
 """
 
 
-def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the sorting and searching
     operators in MXNet.
 
@@ -52,5 +52,5 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', wa
     # Fetch all Random Sampling Operators
     mx_sort_search_ops = get_all_sorting_searching_operators()
     # Run benchmarks
-    mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, warmup, runs)
+    mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, warmup, runs)
     return mx_sort_search_op_results
diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py
@@ -35,7 +35,7 @@
 from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks
 
 
-def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
+def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the unary
     operators in MXNet.
 
@@ -58,5 +58,5 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25,
     # Fetch all Unary Operators
     mx_unary_broadcast_ops = get_all_unary_operators()
     # Run benchmarks
-    mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, warmup, runs)
+    mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, warmup, runs)
     return mx_unary_op_results