Implement changes based on comments
Signed-off-by: Ooi, Boon Sin <[email protected]>
boonsino committed Nov 5, 2024
1 parent 4a6ab42 commit 5f5b6d2
Showing 8 changed files with 67 additions and 14 deletions.
@@ -245,6 +245,25 @@ There are several options for setting the number of inference iterations:
The more iterations a model runs, the better the statistics will be for determining
average latency and throughput.
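
For example, a hypothetical run that finishes only after at least 500 iterations
and at least 60 seconds have both elapsed (the model path is a placeholder) could
look like:

   benchmark_app -m model.xml -d CPU -niter 500 -t 60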

Maximum inference rate
++++++++++++++++++++++

By default, the benchmarking app runs inference at the maximum rate the device
capabilities allow, repeatedly performing inference with the model and measuring
the resulting inference speed. To cap the rate, specify the maximum number of
frames per second with the ``-max_irate <frames_per_second>`` option. If the
option is not specified, inference runs at the maximum rate the device allows.
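
As an illustration, a hypothetical run capped at 10 frames per second for 30 seconds
(the model path is a placeholder) could look like:

   benchmark_app -m model.xml -d CPU -max_irate 10 -t 30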

Inputs
++++++++++++++++++++

@@ -337,7 +356,7 @@ following usage message:
[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
usage: benchmark_app.py [-h [HELP]] [-i PATHS_TO_INPUT [PATHS_TO_INPUT ...]] -m PATH_TO_MODEL [-d TARGET_DEVICE]
[-hint {throughput,cumulative_throughput,latency,none}] [-niter NUMBER_ITERATIONS] [-t TIME] [-b BATCH_SIZE] [-shape SHAPE]
[-hint {throughput,cumulative_throughput,latency,none}] [-niter NUMBER_ITERATIONS] [-max_irate MAXIMUM_INFERENCE_RATE] [-t TIME] [-b BATCH_SIZE] [-shape SHAPE]
[-data_shape DATA_SHAPE] [-layout LAYOUT] [-extensions EXTENSIONS] [-c PATH_TO_CLDNN_CONFIG] [-cdir CACHE_DIR] [-lfile [LOAD_FROM_FILE]]
[-api {sync,async}] [-nireq NUMBER_INFER_REQUESTS] [-nstreams NUMBER_STREAMS] [-inference_only [INFERENCE_ONLY]]
[-infer_precision INFER_PRECISION] [-ip {bool,f16,f32,f64,i8,i16,i32,i64,u8,u16,u32,u64}]
Expand Down Expand Up @@ -536,6 +555,7 @@ following usage message:
'none': no device performance mode will be set.
Using explicit 'nstreams' or other device-specific options, please set hint to 'none'
-niter <integer> Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
-max_irate <float> Optional. Maximum inference rate, in frames per second. If not specified, the inference will run at the maximum rate allowed by the device capabilities.
-t Optional. Time in seconds to execute topology.
Input shapes
14 changes: 8 additions & 6 deletions samples/cpp/benchmark_app/benchmark_app.hpp
@@ -65,10 +65,12 @@ static const char cache_dir_message[] = "Optional. Enables caching of loaded mod
static const char load_from_file_message[] = "Optional. Loads model from file directly without read_model."
" All CNNNetwork options (like re-shape) will be ignored";

/// @brief message for run frequency
static const char run_frequency_message[] =
"Execute at a fixed frequency. Note if the targeted rate per second cannot be reached, "
"the benchmark would start the next run immediately, trying its best to catch up.";
/// @brief message for maximum inference rate
static const char maximum_inference_rate_message[] =
    "Optional. Maximum inference rate, in frames per second. "
    "Limits how fast inference is executed. Note that if the targeted rate per second cannot be reached, "
    "the benchmark will execute at the maximum rate the device capabilities allow.";

/// @brief message for execution time
static const char execution_time_message[] = "Optional. Time in seconds to execute topology.";
@@ -313,7 +315,7 @@ DEFINE_string(api, "async", api_message);
DEFINE_uint64(nireq, 0, infer_requests_count_message);

/// @brief Cap on the rate at which infer requests are executed (frames per second)
DEFINE_double(rfreq, 0, run_frequency_message);
DEFINE_double(max_irate, 0, maximum_inference_rate_message);

/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
DEFINE_string(nstreams, "", infer_num_streams_message);
@@ -396,7 +398,7 @@ static void show_usage() {
std::cout << " -hint <performance hint> (latency or throughput or cumulative_throughput or none) "
<< hint_message << std::endl;
std::cout << " -niter <integer> " << iterations_count_message << std::endl;
std::cout << " -rfreq \"<float>\" " << run_frequency_message << std::endl;
std::cout << " -max_irate \"<float>\" " << maximum_inference_rate_message << std::endl;
std::cout << " -t " << execution_time_message << std::endl;
std::cout << std::endl;
std::cout << "Input shapes" << std::endl;
4 changes: 2 additions & 2 deletions samples/cpp/benchmark_app/main.cpp
@@ -1155,8 +1155,8 @@ int main(int argc, char* argv[]) {
execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
processedFramesN += batchSize;

if (FLAGS_rfreq > 0) {
int64_t nextRunFinishTime = 1 / FLAGS_rfreq * processedFramesN * 1.0e9;
if (FLAGS_max_irate > 0) {
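// Earliest time (in ns since the start of the run) by which processedFramesN frames
// are allowed to have completed at the requested rate; the sleep below waits out the remainder.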
int64_t nextRunFinishTime = 1 / FLAGS_max_irate * processedFramesN * 1.0e9;
std::this_thread::sleep_for(std::chrono::nanoseconds(nextRunFinishTime - execTime));
}
}
12 changes: 11 additions & 1 deletion tests/samples_tests/smoke_tests/test_benchmark_app.py
100644 → 100755
@@ -38,13 +38,16 @@ def create_random_4bit_bin_file(tmp_path, shape, name):
f.write(raw_data)


def verify(sample_language, device, api=None, nireq=None, shape=None, data_shape=None, nstreams=None, layout=None, pin=None, cache=None, tmp_path=None, model='bvlcalexnet-12.onnx', inp='dog-224x224.bmp', batch='1', niter='10', tm=None):
def verify(sample_language, device, api=None, nireq=None, shape=None, data_shape=None, nstreams=None,
layout=None, pin=None, cache=None, tmp_path=None, model='bvlcalexnet-12.onnx',
inp='dog-224x224.bmp', batch='1', niter='10', max_irate=None, tm=None):
output = get_cmd_output(
get_executable(sample_language),
*prepend(cache, inp, model, tmp_path),
*('-nstreams', nstreams) if nstreams else '',
*('-layout', layout) if layout else '',
*('-nireq', nireq) if nireq else '',
*('-max_irate', max_irate) if max_irate else '',
*('-shape', shape) if shape else '',
*('-data_shape', data_shape) if data_shape else '',
*('-hint', 'none') if nstreams or pin else '',
@@ -84,6 +87,13 @@ def test_nireq(sample_language, api, nireq, device, cache, tmp_path):
verify(sample_language, device, api=api, nireq=nireq, cache=cache, tmp_path=tmp_path)


@pytest.mark.parametrize('sample_language', ['C++', 'Python'])
@pytest.mark.parametrize('max_irate', ['', '0', '10'])
@pytest.mark.parametrize('device', get_devices())
def test_max_irate(sample_language, device, max_irate, cache, tmp_path):
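# '' omits the flag entirely, '0' passes an explicit zero (no rate cap), '10' caps inference at 10 FPS.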
verify(sample_language, device, max_irate=max_irate, cache=cache, tmp_path=tmp_path)


@pytest.mark.skipif('CPU' not in get_devices(), reason='affinity is a CPU property')
@pytest.mark.parametrize('sample_language', ['C++', 'Python'])
@pytest.mark.parametrize('pin', ['YES', 'NO', 'NUMA', 'HYBRID_AWARE'])
23 changes: 20 additions & 3 deletions tools/benchmark_tool/openvino/tools/benchmark/benchmark.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import os
import time
from datetime import datetime
from math import ceil
from openvino.runtime import Core, get_version, AsyncInferQueue
@@ -15,7 +16,8 @@ def percentile(values, percent):

class Benchmark:
def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
duration_seconds: int = None, api_type: str = 'async', inference_only = None):
duration_seconds: int = None, api_type: str = 'async', inference_only = None,
maximum_inference_rate: float = 0):
self.device = device
self.core = Core()
self.nireq = number_infer_requests if api_type == 'async' else 1
@@ -24,6 +26,7 @@ def __init__(self, device: str, number_infer_requests: int = 0, number_iteration
self.api_type = api_type
self.inference_only = inference_only
self.latency_groups = []
self.max_irate = maximum_inference_rate

def __del__(self):
del self.core
@@ -83,24 +86,34 @@ def first_infer(self, requests):
requests.wait_all()
return requests[id].latency

def inference_rate_delay(self, processed_frames, exec_time):
if self.max_irate and self.max_irate > 0:
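# At a cap of max_irate frames per second, the first `processed_frames` frames should
# take at least processed_frames / max_irate seconds; sleep off whatever part of that
# budget has not elapsed yet.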
next_run_finish_time = 1 / self.max_irate * processed_frames
delay = next_run_finish_time - exec_time
time.sleep(delay if delay > 0 else 0)

def sync_inference(self, request, data_queue):
processed_frames = 0
exec_time = 0
iteration = 0
times = []
start_time = datetime.utcnow()
while (self.niter and iteration < self.niter) or \
(self.duration_seconds and exec_time < self.duration_seconds):
processed_frames += data_queue.get_next_batch_size()
if self.inference_only == False:
request.set_input_tensors(data_queue.get_next_input())
request.infer()
times.append(request.latency)
iteration += 1

exec_time = (datetime.utcnow() - start_time).total_seconds()
self.inference_rate_delay(processed_frames, exec_time)
total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
return sorted(times), total_duration_sec, iteration

def async_inference_only(self, infer_queue):
def async_inference_only(self, infer_queue, data_queue):
processed_frames = 0
exec_time = 0
iteration = 0
times = []
@@ -109,6 +122,7 @@ def async_inference_only(self, infer_queue):
while (self.niter and iteration < self.niter) or \
(self.duration_seconds and exec_time < self.duration_seconds) or \
(iteration % self.nireq):
processed_frames += data_queue.get_next_batch_size()
idle_id = infer_queue.get_idle_request_id()
if idle_id in in_fly:
times.append(infer_queue[idle_id].latency)
@@ -118,6 +132,8 @@ def async_inference_only(self, infer_queue):
iteration += 1

exec_time = (datetime.utcnow() - start_time).total_seconds()
self.inference_rate_delay(processed_frames, exec_time)

infer_queue.wait_all()
total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
for infer_request_id in in_fly:
Expand Down Expand Up @@ -149,6 +165,7 @@ def async_inference_full_mode(self, infer_queue, data_queue, pcseq):
iteration += 1

exec_time = (datetime.utcnow() - start_time).total_seconds()
self.inference_rate_delay(processed_frames, exec_time)
infer_queue.wait_all()
total_duration_sec = (datetime.utcnow() - start_time).total_seconds()

@@ -164,7 +181,7 @@ def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq)
times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue)
fps = len(batch_size) * iteration / total_duration_sec
elif self.inference_only:
times, total_duration_sec, iteration = self.async_inference_only(requests)
times, total_duration_sec, iteration = self.async_inference_only(requests, data_queue)
fps = len(batch_size) * iteration / total_duration_sec
else:
times, total_duration_sec, processed_frames, iteration = self.async_inference_full_mode(requests, data_queue, pcseq)
3 changes: 2 additions & 1 deletion tools/benchmark_tool/openvino/tools/benchmark/main.py
100644 → 100755
@@ -85,7 +85,8 @@ def is_flag_set_in_command_line(flag):
next_step(step_id=2)

benchmark = Benchmark(args.target_device, args.number_infer_requests,
args.number_iterations, args.time, args.api_type, args.inference_only)
args.number_iterations, args.time, args.api_type,
args.inference_only, args.maximum_inference_rate)

if args.extensions:
benchmark.add_extension(path_to_extensions=args.extensions)
3 changes: 3 additions & 0 deletions tools/benchmark_tool/openvino/tools/benchmark/parameters.py
@@ -72,6 +72,9 @@ def parse_args():
args.add_argument('-niter', '--number_iterations', type=check_positive, required=False, default=None,
help='Optional. Number of iterations. '
'If not specified, the number of iterations is calculated depending on a device.')
args.add_argument('-max_irate', '--maximum_inference_rate', type=float, required=False, default=None,
help='Optional. Maximum inference rate, in frames per second. '
'If not specified, the inference will run at the maximum rate allowed by the device capabilities.')
args.add_argument('-t', '--time', type=check_positive, required=False, default=None,
help='Optional. Time in seconds to execute topology.')

Empty file modified tools/benchmark_tool/setup.py
100644 → 100755
Empty file.
