Commit
Fix changes based on comments
Signed-off-by: Ooi, Boon Sin <[email protected]>
boonsino committed Oct 30, 2024
1 parent e22a77d commit 1762638
Showing 8 changed files with 84 additions and 21 deletions.
@@ -245,6 +245,25 @@ There are several options for setting the number of inference iterations:
The more iterations a model runs, the better the statistics will be for determining
average latency and throughput.

Fixed inference rate
++++++++++++++++++++

By default, the benchmarking app runs inference at the maximum rate the device
capabilities allow. Use the ``-irate <frames_per_second>`` option to execute
inference at a fixed rate instead. If the targeted number of executions per
second cannot be reached, the app starts the next run immediately, trying its
best to catch up. A minimal sketch of the pacing logic is shown below.
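
The pacing applied by ``-irate`` can be pictured as sleeping until the next
frame is due. The following Python sketch only illustrates the idea; the
function and variable names are illustrative, not the benchmark implementation:

.. code-block:: python

   import time

   def pace_to_rate(irate, processed_frames, start_time):
       # Time (seconds since start) by which the processed frames should be
       # finished in order to hold the requested rate.
       next_run_finish_time = processed_frames / irate
       elapsed = time.monotonic() - start_time
       # Sleep only when ahead of schedule; never pass a negative value.
       time.sleep(max(0.0, next_run_finish_time - elapsed))

For example, ``-irate 30`` paces execution to roughly 30 inferences per second
(an illustrative value).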

Inputs
++++++++++++++++++++

@@ -337,7 +356,7 @@ following usage message:
[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
usage: benchmark_app.py [-h [HELP]] [-i PATHS_TO_INPUT [PATHS_TO_INPUT ...]] -m PATH_TO_MODEL [-d TARGET_DEVICE]
[-hint {throughput,cumulative_throughput,latency,none}] [-niter NUMBER_ITERATIONS] [-t TIME] [-b BATCH_SIZE] [-shape SHAPE]
[-hint {throughput,cumulative_throughput,latency,none}] [-niter NUMBER_ITERATIONS] [-irate INFERENCE_RATE] [-t TIME] [-b BATCH_SIZE] [-shape SHAPE]
[-data_shape DATA_SHAPE] [-layout LAYOUT] [-extensions EXTENSIONS] [-c PATH_TO_CLDNN_CONFIG] [-cdir CACHE_DIR] [-lfile [LOAD_FROM_FILE]]
[-api {sync,async}] [-nireq NUMBER_INFER_REQUESTS] [-nstreams NUMBER_STREAMS] [-inference_only [INFERENCE_ONLY]]
[-infer_precision INFER_PRECISION] [-ip {bool,f16,f32,f64,i8,i16,i32,i64,u8,u16,u32,u64}]
@@ -536,6 +555,7 @@ following usage message:
'none': no device performance mode will be set.
Using explicit 'nstreams' or other device-specific options, please set hint to 'none'
-niter <integer> Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
-irate <integer> Optional. Inference rate in frames per second. If not specified, the inference will run at the maximum rate the device capabilities allow.
-t Optional. Time in seconds to execute topology.
Input shapes
14 changes: 8 additions & 6 deletions samples/cpp/benchmark_app/benchmark_app.hpp
@@ -65,10 +65,12 @@ static const char cache_dir_message[] = "Optional. Enables caching of loaded mod
static const char load_from_file_message[] = "Optional. Loads model from file directly without read_model."
" All CNNNetwork options (like re-shape) will be ignored";

/// @brief message for run frequency
static const char run_frequency_message[] =
"Execute at a fixed frequency. Note if the targeted rate per second cannot be reached, "
"the benchmark would start the next run immediately, trying its best to catch up.";
/// @brief message for inference rate
static const char inference_rate_message[] =
"Optional. Execute inference at a fixed rate, in executions per second (default 0, i.e. no fixed rate). "
"Note that if the targeted number of executions per second cannot be reached, "
"the benchmark will start the next run immediately, trying its best to catch up.";

/// @brief message for execution time
static const char execution_time_message[] = "Optional. Time in seconds to execute topology.";
@@ -313,7 +315,7 @@ DEFINE_string(api, "async", api_message);
DEFINE_uint64(nireq, 0, infer_requests_count_message);

/// @brief Execute infer requests at a fixed frequency
DEFINE_double(rfreq, 0, run_frequency_message);
DEFINE_double(irate, 0, inference_rate_message);

/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
DEFINE_string(nstreams, "", infer_num_streams_message);
@@ -396,7 +398,7 @@ static void show_usage() {
std::cout << " -hint <performance hint> (latency or throughput or cumulative_throughput or none) "
<< hint_message << std::endl;
std::cout << " -niter <integer> " << iterations_count_message << std::endl;
std::cout << " -rfreq \"<float>\" " << run_frequency_message << std::endl;
std::cout << " -irate \"<float>\" " << inference_rate_message << std::endl;
std::cout << " -t " << execution_time_message << std::endl;
std::cout << std::endl;
std::cout << "Input shapes" << std::endl;
4 changes: 2 additions & 2 deletions samples/cpp/benchmark_app/main.cpp
@@ -1155,8 +1155,8 @@ int main(int argc, char* argv[]) {
execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
processedFramesN += batchSize;

if (FLAGS_rfreq > 0) {
int64_t nextRunFinishTime = 1 / FLAGS_rfreq * processedFramesN * 1.0e9;
if (FLAGS_irate > 0) {
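// Target completion time (in nanoseconds from the start) for the frames processed so far at the requested rate; any time the run is ahead of schedule is slept off below.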
int64_t nextRunFinishTime = 1 / FLAGS_irate * processedFramesN * 1.0e9;
std::this_thread::sleep_for(std::chrono::nanoseconds(nextRunFinishTime - execTime));
}
}
25 changes: 20 additions & 5 deletions tools/benchmark_tool/openvino/tools/benchmark/benchmark.py
@@ -2,28 +2,30 @@
# SPDX-License-Identifier: Apache-2.0

import os
import time
from datetime import datetime
from math import ceil
from openvino.runtime import Core, get_version, AsyncInferQueue

from .utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
from .utils.logging import logger
from .utils.utils import get_duration_seconds
from .utils.utils import get_duration_seconds, get_network_batch_size

def percentile(values, percent):
return values[ceil(len(values) * percent / 100) - 1]

class Benchmark:
def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
duration_seconds: int = None, api_type: str = 'async', inference_only = None):
duration_seconds: int = None, api_type: str = 'async', inference_only = None,
inference_rate: int = 0):
self.device = device
self.core = Core()
self.nireq = number_infer_requests if api_type == 'async' else 1
self.niter = number_iterations
self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
self.api_type = api_type
self.inference_only = inference_only
self.latency_groups = []
self.irate = inference_rate
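# Requested inference rate in frames per second; 0 (or None) means no fixed rate, i.e. run as fast as possible.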

def __del__(self):
del self.core
@@ -83,24 +85,33 @@ def first_infer(self, requests):
requests.wait_all()
return requests[id].latency

def inference_rate_delay(self, processed_frames, exec_time):
# Pace execution to the requested inference rate (frames per second).
if self.irate and self.irate > 0:
# Time (seconds since start) by which processed_frames should have completed.
next_run_finish_time = processed_frames / self.irate
# Sleep only when ahead of schedule; time.sleep() rejects negative values.
time.sleep(max(0, next_run_finish_time - exec_time))

def sync_inference(self, request, data_queue):
processed_frames = 0
exec_time = 0
iteration = 0
times = []
start_time = datetime.utcnow()
while (self.niter and iteration < self.niter) or \
(self.duration_seconds and exec_time < self.duration_seconds):
processed_frames += data_queue.get_next_batch_size()
if self.inference_only == False:
request.set_input_tensors(data_queue.get_next_input())
request.infer()
times.append(request.latency)
iteration += 1

exec_time = (datetime.utcnow() - start_time).total_seconds()
self.inference_rate_delay(processed_frames, exec_time)
total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
return sorted(times), total_duration_sec, iteration

def async_inference_only(self, infer_queue):
def async_inference_only(self, infer_queue, data_queue):
processed_frames = 0
exec_time = 0
iteration = 0
times = []
@@ -109,6 +120,7 @@ def async_inference_only(self, infer_queue):
while (self.niter and iteration < self.niter) or \
(self.duration_seconds and exec_time < self.duration_seconds) or \
(iteration % self.nireq):
processed_frames += data_queue.get_next_batch_size()
idle_id = infer_queue.get_idle_request_id()
if idle_id in in_fly:
times.append(infer_queue[idle_id].latency)
@@ -118,6 +130,8 @@ def async_inference_only(self, infer_queue):
iteration += 1

exec_time = (datetime.utcnow() - start_time).total_seconds()
self.inference_rate_delay(processed_frames, exec_time)

infer_queue.wait_all()
total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
for infer_request_id in in_fly:
@@ -149,6 +163,7 @@ def async_inference_full_mode(self, infer_queue, data_queue, pcseq):
iteration += 1

exec_time = (datetime.utcnow() - start_time).total_seconds()
self.inference_rate_delay(processed_frames, exec_time)
infer_queue.wait_all()
total_duration_sec = (datetime.utcnow() - start_time).total_seconds()

@@ -164,7 +179,7 @@ def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq)
times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue)
fps = len(batch_size) * iteration / total_duration_sec
elif self.inference_only:
times, total_duration_sec, iteration = self.async_inference_only(requests)
times, total_duration_sec, iteration = self.async_inference_only(requests, data_queue)
fps = len(batch_size) * iteration / total_duration_sec
else:
times, total_duration_sec, processed_frames, iteration = self.async_inference_full_mode(requests, data_queue, pcseq)
3 changes: 2 additions & 1 deletion tools/benchmark_tool/openvino/tools/benchmark/main.py
100644 → 100755
@@ -85,7 +85,8 @@ def is_flag_set_in_command_line(flag):
next_step(step_id=2)

benchmark = Benchmark(args.target_device, args.number_infer_requests,
args.number_iterations, args.time, args.api_type, args.inference_only)
args.number_iterations, args.time, args.api_type,
args.inference_only, args.inference_rate)

if args.extensions:
benchmark.add_extension(path_to_extensions=args.extensions)
3 changes: 3 additions & 0 deletions tools/benchmark_tool/openvino/tools/benchmark/parameters.py
@@ -72,6 +72,9 @@ def parse_args():
args.add_argument('-niter', '--number_iterations', type=check_positive, required=False, default=None,
help='Optional. Number of iterations. '
'If not specified, the number of iterations is calculated depending on a device.')
args.add_argument('-irate', '--inference_rate', type=int, required=False, default=0,
help='Optional. Inference rate in frames per second. '
'If not specified, the inference will run at the maximum rate the device capabilities allow.')
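# Illustrative usage (hypothetical paths): 'benchmark_app.py -m model.xml -irate 30' paces execution to roughly 30 inferences per second.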
args.add_argument('-t', '--time', type=check_positive, required=False, default=None,
help='Optional. Time in seconds to execute topology.')

17 changes: 14 additions & 3 deletions tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen
Git LFS file not shown
17 changes: 14 additions & 3 deletions tools/ovc/unit_tests/moc_tf_fe/test_models/model_fp32.frozen
Git LFS file not shown
