Introduce -latency_percentile flag for the benchmark_app tool (openvinotoolkit#6479)

* Introduce new -latency_percentile flag for benchmark_app

* Fix syntax
ivankochin authored and rnugmanx committed Aug 26, 2021
1 parent b347144 commit c64a67e
Showing 6 changed files with 50 additions and 13 deletions.
1 change: 1 addition & 0 deletions inference-engine/samples/benchmark_app/README.md
@@ -95,6 +95,7 @@ Options:
-layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size.
-cache_dir "<path>" Optional. Enables caching of loaded models to specified directory.
-load_from_file Optional. Loads model from file directly without ReadNetwork.
+  -latency_percentile Optional. Defines the percentile to be reported in the latency metric. The valid range is [1, 100]. The default value is 50 (median).
CPU-specific performance options:
-nstreams "<integer>" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices
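For context, a minimal sketch of what the new flag reports (not part of the commit; the latencies below are made up): the default of 50 gives the familiar median, while higher values expose the slow tail that a median hides.

```python
from math import ceil

# Made-up per-iteration latencies in milliseconds.
latencies_ms = sorted([9.8, 10.1, 10.3, 10.9, 11.2, 12.0, 15.7, 40.2])

def report(p):
    # p-th percentile: the smallest sample with at least p% of samples at or below it.
    return latencies_ms[ceil(len(latencies_ms) * p / 100) - 1]

print(f"latency (50 percentile): {report(50):.2f} ms")  # 10.90 ms -> the typical iteration
print(f"latency (90 percentile): {report(90):.2f} ms")  # 40.20 ms -> the slow tail
```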
8 changes: 8 additions & 0 deletions inference-engine/samples/benchmark_app/benchmark_app.hpp
@@ -56,6 +56,10 @@ static const char infer_num_streams_message[] = "Optional. Number of streams to
"Also, using nstreams>1 is inherently throughput-oriented option, "
"while for the best-latency estimations the number of streams should be set to 1.";

+/// @brief message for latency percentile settings
+static const char infer_latency_percentile_message[] =
+"Optional. Defines the percentile to be reported in the latency metric. The valid range is [1, 100]. The default value is 50 (median).";

/// @brief message for enforcing of BF16 execution where it is possible
static const char enforce_bf16_message[] = "Optional. By default floating point operations execution in bfloat16 precision are enforced "
"if supported by platform.\n"
@@ -189,6 +193,9 @@ DEFINE_uint32(nthreads, 0, infer_num_threads_message);
/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
DEFINE_string(nstreams, "", infer_num_streams_message);

+/// @brief The percentile which will be reported in the latency metric
+DEFINE_uint32(latency_percentile, 50, infer_latency_percentile_message);

/// @brief Enforces bf16 execution with bfloat16 precision on systems having this capability
DEFINE_bool(enforcebf16, false, enforce_bf16_message);

@@ -278,6 +285,7 @@ static void showUsage() {
std::cout << " -layout " << layout_message << std::endl;
std::cout << " -cache_dir \"<path>\" " << cache_dir_message << std::endl;
std::cout << " -load_from_file " << load_from_file_message << std::endl;
std::cout << " -latency_percentile " << infer_latency_percentile_message << std::endl;
std::cout << std::endl << " device-specific performance options:" << std::endl;
std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
30 changes: 23 additions & 7 deletions inference-engine/samples/benchmark_app/main.cpp
@@ -52,6 +52,10 @@ bool ParseAndCheckCommandLine(int argc, char* argv[]) {
throw std::logic_error("Model is required but not set. Please set -m option.");
}

+if (FLAGS_latency_percentile > 100 || FLAGS_latency_percentile < 1) {
+showUsage();
+throw std::logic_error("The percentile value is incorrect. The valid range is [1, 100].");
+}
if (FLAGS_api != "async" && FLAGS_api != "sync") {
throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
}
@@ -100,11 +104,10 @@ static void next_step(const std::string additional_info = "") {
}

template <typename T>
-T getMedianValue(const std::vector<T>& vec) {
+T getMedianValue(const std::vector<T>& vec, std::size_t percentile) {
std::vector<T> sortedVec(vec);
std::sort(sortedVec.begin(), sortedVec.end());
-return (sortedVec.size() % 2 != 0) ? sortedVec[sortedVec.size() / 2ULL]
-: (sortedVec[sortedVec.size() / 2ULL] + sortedVec[sortedVec.size() / 2ULL - 1ULL]) / static_cast<T>(2.0);
+return sortedVec[(sortedVec.size() * percentile + 99) / 100 - 1];  // ceil(size * percentile / 100) - 1, stays in bounds for any size
}

/**
@@ -624,7 +627,7 @@ int main(int argc, char* argv[]) {
// wait the latest inference executions
inferRequestsQueue.waitAll();

-double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
+double latency = getMedianValue<double>(inferRequestsQueue.getLatencies(), FLAGS_latency_percentile);
double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency : batchSize * 1000.0 * iteration / totalDuration;

@@ -634,8 +637,14 @@
{"total number of iterations", std::to_string(iteration)},
});
if (device_name.find("MULTI") == std::string::npos) {
+std::string latency_label;
+if (FLAGS_latency_percentile == 50) {
+latency_label = "latency (ms)";
+} else {
+latency_label = "latency (" + std::to_string(FLAGS_latency_percentile) + " percentile) (ms)";
+}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {
{"latency (ms)", double_to_string(latency)},
{latency_label, double_to_string(latency)},
});
}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"throughput", double_to_string(fps)}});
@@ -684,8 +693,15 @@

std::cout << "Count: " << iteration << " iterations" << std::endl;
std::cout << "Duration: " << double_to_string(totalDuration) << " ms" << std::endl;
-if (device_name.find("MULTI") == std::string::npos)
-std::cout << "Latency: " << double_to_string(latency) << " ms" << std::endl;
+if (device_name.find("MULTI") == std::string::npos) {
+std::cout << "Latency";
+if (FLAGS_latency_percentile == 50) {
+std::cout << ": ";
+} else {
+std::cout << " (" << FLAGS_latency_percentile << " percentile): ";
+}
+std::cout << double_to_string(latency) << " ms" << std::endl;
+}
std::cout << "Throughput: " << double_to_string(fps) << " FPS" << std::endl;
} catch (const std::exception& ex) {
slog::err << ex.what() << slog::endl;
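One subtlety in the C++ helper: naive integer arithmetic such as `(size / 100) * percentile` truncates to zero whenever there are fewer than 100 samples, so the ceiling-based index is the safe form. A quick sanity check of that indexing, sketched in Python (not part of the commit):

```python
from math import ceil
from statistics import median_low

def pct_index(n, p):
    # Index of the p-th percentile in a sorted vector of n samples,
    # i.e. ceil(n * p / 100) - 1, as in the C++ helper above.
    return ceil(n * p / 100) - 1

for n in (1, 5, 10, 99, 100, 1000):
    assert 0 <= pct_index(n, 1) and pct_index(n, 100) == n - 1       # always in bounds
    assert list(range(n))[pct_index(n, 50)] == median_low(range(n))  # p=50 is the lower median
print("indexing stays in bounds; p=50 matches the lower median")
```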
9 changes: 6 additions & 3 deletions tools/benchmark_tool/openvino/tools/benchmark/benchmark.py
@@ -3,14 +3,17 @@

import os
from datetime import datetime
-from statistics import median
+from math import ceil
from openvino.inference_engine import IENetwork, IECore, get_version, StatusCode

from .utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
from .utils.logging import logger
from .utils.utils import get_duration_seconds
from .utils.statistics_report import StatisticsReport

+def percentile(values, percent):
+    return values[ceil(len(values) * percent / 100) - 1]

class Benchmark:
def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
duration_seconds: int = None, api_type: str = 'async'):
@@ -98,7 +101,7 @@ def first_infer(self, exe_network):
raise Exception(f"Wait for all requests is failed with status code {status}!")
return infer_request.latency

-def infer(self, exe_network, batch_size, progress_bar=None):
+def infer(self, exe_network, batch_size, latency_percentile, progress_bar=None):
progress_count = 0
infer_requests = exe_network.requests

@@ -155,7 +158,7 @@ def infer(self, exe_network, batch_size, progress_bar=None):
for infer_request_id in in_fly:
times.append(infer_requests[infer_request_id].latency)
times.sort()
-latency_ms = median(times)
+latency_ms = percentile(times, latency_percentile)
fps = batch_size * 1000 / latency_ms if self.api_type == 'sync' else batch_size * iteration / total_duration_sec
if progress_bar:
progress_bar.finish()
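Note that the new `percentile` helper indexes into its argument directly, so it relies on the caller sorting first — which `infer` does via `times.sort()`. A small usage sketch under that assumption (made-up numbers, not part of the commit):

```python
from math import ceil

def percentile(values, percent):
    # Same helper as above; `values` must already be sorted ascending.
    return values[ceil(len(values) * percent / 100) - 1]

times = [12.4, 9.1, 10.0, 11.7, 58.3]  # made-up request latencies (ms)
times.sort()                           # the caller sorts, as infer() does
print(percentile(times, 50))           # 11.7
print(percentile(times, 99))           # 58.3 -> worst-case tail
```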
13 changes: 10 additions & 3 deletions tools/benchmark_tool/openvino/tools/benchmark/main.py
@@ -344,7 +344,7 @@ def set_throughput_streams():
[
('first inference time (ms)', duration_ms)
])
-fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, progress_bar)
+fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, args.latency_percentile, progress_bar)

# ------------------------------------ 11. Dumping statistics report -------------------------------------------
next_step()
@@ -372,9 +372,13 @@ def set_throughput_streams():
('total number of iterations', str(iteration)),
])
if MULTI_DEVICE_NAME not in device_name:
+if args.latency_percentile == 50:
+    latency_prefix = 'latency (ms)'
+else:
+    latency_prefix = f'latency ({args.latency_percentile} percentile) (ms)'
statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
[
-('latency (ms)', f'{latency_ms:.2f}'),
+(latency_prefix, f'{latency_ms:.2f}'),
])

statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
@@ -388,7 +392,10 @@
print(f'Count: {iteration} iterations')
print(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
if MULTI_DEVICE_NAME not in device_name:
-print(f'Latency: {latency_ms:.2f} ms')
+if args.latency_percentile == 50:
+    print(f'Latency: {latency_ms:.2f} ms')
+else:
+    print(f'Latency ({args.latency_percentile} percentile): {latency_ms:.2f} ms')
print(f'Throughput: {fps:.2f} FPS')

del exe_network
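The same naming rule drives both the statistics report and the console output; a compact restatement of that branching as a standalone helper (hypothetical, not in the commit):

```python
def latency_label(latency_percentile: int) -> str:
    # Plain label for the default median; percentile-qualified otherwise.
    if latency_percentile == 50:
        return 'latency (ms)'
    return f'latency ({latency_percentile} percentile) (ms)'

assert latency_label(50) == 'latency (ms)'
assert latency_label(95) == 'latency (95 percentile) (ms)'
```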
2 changes: 2 additions & 0 deletions tools/benchmark_tool/openvino/tools/benchmark/parameters.py
@@ -84,6 +84,8 @@ def parse_args():
'Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency '
'estimations the number of streams should be set to 1. '
'See samples README for more details.')
+args.add_argument('--latency_percentile', type=int, required=False, default=50, choices=range(1,101),
+                  help='Optional. Defines the percentile to be reported in the latency metric. The valid range is [1, 100]. The default value is 50 (median).')
args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True, choices=[True, False],
help='Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. '
'\'true\' - enable bfloat16 regardless of platform support. '
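Because the argument is declared with `choices=range(1, 101)`, argparse enforces the [1, 100] range at parse time, mirroring the explicit check in the C++ sample. A self-contained sketch of that behavior (a standalone parser, not the tool's real one):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--latency_percentile', type=int, default=50,
                    choices=range(1, 101), metavar='[1-100]',
                    help='Percentile to report in the latency metric.')

print(parser.parse_args([]).latency_percentile)                              # 50
print(parser.parse_args(['--latency_percentile', '90']).latency_percentile)  # 90
# parser.parse_args(['--latency_percentile', '0'])  # exits: invalid choice
```

One note on this pattern: without a `metavar`, argparse would expand all one hundred choices into the usage string, so a compact metavar keeps the `-h` output readable.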
