Introduce -latency_percentile flag for the benchmark_app tool (openvinotoolkit#6479)

* Introduce new -latency_percentile flag for benchmark_app

* Fix syntax
ivankochin authored and rnugmanx committed Aug 26, 2021
1 parent b347144 commit c64a67e
Showing 6 changed files with 50 additions and 13 deletions.
1 change: 1 addition & 0 deletions inference-engine/samples/benchmark_app/README.md
@@ -95,6 +95,7 @@ Options:
-layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size.
-cache_dir "<path>" Optional. Enables caching of loaded models to specified directory.
-load_from_file Optional. Loads model from file directly without ReadNetwork.
+  -latency_percentile Optional. Defines the percentile to be reported in the latency metric. The valid range is [1, 100]. The default value is 50 (median).
CPU-specific performance options:
-nstreams "<integer>" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices
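For context, a minimal sketch of what the new flag reports (not part of the commit; the latencies below are made up): the default of 50 gives the familiar median, while higher values expose the slow tail that a median hides.

```python
from math import ceil

# Made-up per-iteration latencies in milliseconds.
latencies_ms = sorted([9.8, 10.1, 10.3, 10.9, 11.2, 12.0, 15.7, 40.2])

def report(p):
    # p-th percentile: the smallest sample with at least p% of samples at or below it.
    return latencies_ms[ceil(len(latencies_ms) * p / 100) - 1]

print(f"latency (50 percentile): {report(50):.2f} ms")  # 10.90 ms -> the typical iteration
print(f"latency (90 percentile): {report(90):.2f} ms")  # 40.20 ms -> the slow tail
```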
8 changes: 8 additions & 0 deletions inference-engine/samples/benchmark_app/benchmark_app.hpp
@@ -56,6 +56,10 @@ static const char infer_num_streams_message[] = "Optional. Number of streams to
"Also, using nstreams>1 is inherently throughput-oriented option, "
"while for the best-latency estimations the number of streams should be set to 1.";

+/// @brief message for latency percentile settings
+static const char infer_latency_percentile_message[] =
+"Optional. Defines the percentile to be reported in the latency metric. The valid range is [1, 100]. The default value is 50 (median).";

/// @brief message for enforcing of BF16 execution where it is possible
static const char enforce_bf16_message[] = "Optional. By default floating point operations execution in bfloat16 precision are enforced "
"if supported by platform.\n"
@@ -189,6 +193,9 @@ DEFINE_uint32(nthreads, 0, infer_num_threads_message);
/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
DEFINE_string(nstreams, "", infer_num_streams_message);

+/// @brief The percentile which will be reported in the latency metric
+DEFINE_uint32(latency_percentile, 50, infer_latency_percentile_message);

/// @brief Enforces bf16 execution with bfloat16 precision on systems having this capability
DEFINE_bool(enforcebf16, false, enforce_bf16_message);

@@ -278,6 +285,7 @@ static void showUsage() {
std::cout << " -layout " << layout_message << std::endl;
std::cout << " -cache_dir \"<path>\" " << cache_dir_message << std::endl;
std::cout << " -load_from_file " << load_from_file_message << std::endl;
std::cout << " -latency_percentile " << infer_latency_percentile_message << std::endl;
std::cout << std::endl << " device-specific performance options:" << std::endl;
std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
30 changes: 23 additions & 7 deletions inference-engine/samples/benchmark_app/main.cpp
@@ -52,6 +52,10 @@ bool ParseAndCheckCommandLine(int argc, char* argv[]) {
throw std::logic_error("Model is required but not set. Please set -m option.");
}

+if (FLAGS_latency_percentile > 100 || FLAGS_latency_percentile < 1) {
+showUsage();
+throw std::logic_error("The percentile value is incorrect. The valid range is [1, 100].");
+}
if (FLAGS_api != "async" && FLAGS_api != "sync") {
throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
}
@@ -100,11 +104,10 @@ static void next_step(const std::string additional_info = "") {
}

template <typename T>
-T getMedianValue(const std::vector<T>& vec) {
+T getMedianValue(const std::vector<T>& vec, std::size_t percentile) {
std::vector<T> sortedVec(vec);
std::sort(sortedVec.begin(), sortedVec.end());
-return (sortedVec.size() % 2 != 0) ? sortedVec[sortedVec.size() / 2ULL]
-: (sortedVec[sortedVec.size() / 2ULL] + sortedVec[sortedVec.size() / 2ULL - 1ULL]) / static_cast<T>(2.0);
+return sortedVec[(sortedVec.size() * percentile + 99) / 100 - 1];  // ceil(size * percentile / 100) - 1, stays in bounds for any size
}

/**
@@ -624,7 +627,7 @@ int main(int argc, char* argv[]) {
// wait the latest inference executions
inferRequestsQueue.waitAll();

-double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
+double latency = getMedianValue<double>(inferRequestsQueue.getLatencies(), FLAGS_latency_percentile);
double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency : batchSize * 1000.0 * iteration / totalDuration;

@@ -634,8 +637,14 @@
{"total number of iterations", std::to_string(iteration)},
});
if (device_name.find("MULTI") == std::string::npos) {
+std::string latency_label;
+if (FLAGS_latency_percentile == 50) {
+latency_label = "latency (ms)";
+} else {
+latency_label = "latency (" + std::to_string(FLAGS_latency_percentile) + " percentile) (ms)";
+}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {
{"latency (ms)", double_to_string(latency)},
{latency_label, double_to_string(latency)},
});
}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"throughput", double_to_string(fps)}});
@@ -684,8 +693,15 @@

std::cout << "Count: " << iteration << " iterations" << std::endl;
std::cout << "Duration: " << double_to_string(totalDuration) << " ms" << std::endl;
-if (device_name.find("MULTI") == std::string::npos)
-std::cout << "Latency: " << double_to_string(latency) << " ms" << std::endl;
+if (device_name.find("MULTI") == std::string::npos) {
+std::cout << "Latency";
+if (FLAGS_latency_percentile == 50) {
+std::cout << ": ";
+} else {
+std::cout << " (" << FLAGS_latency_percentile << " percentile): ";
+}
+std::cout << double_to_string(latency) << " ms" << std::endl;
+}
std::cout << "Throughput: " << double_to_string(fps) << " FPS" << std::endl;
} catch (const std::exception& ex) {
slog::err << ex.what() << slog::endl;
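One subtlety in the C++ helper: naive integer arithmetic such as `(size / 100) * percentile` truncates to zero whenever there are fewer than 100 samples, so the ceiling-based index is the safe form. A quick sanity check of that indexing, sketched in Python (not part of the commit):

```python
from math import ceil
from statistics import median_low

def pct_index(n, p):
    # Index of the p-th percentile in a sorted vector of n samples,
    # i.e. ceil(n * p / 100) - 1, as in the C++ helper above.
    return ceil(n * p / 100) - 1

for n in (1, 5, 10, 99, 100, 1000):
    assert 0 <= pct_index(n, 1) and pct_index(n, 100) == n - 1       # always in bounds
    assert list(range(n))[pct_index(n, 50)] == median_low(range(n))  # p=50 is the lower median
print("indexing stays in bounds; p=50 matches the lower median")
```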
9 changes: 6 additions & 3 deletions tools/benchmark_tool/openvino/tools/benchmark/benchmark.py
@@ -3,14 +3,17 @@

import os
from datetime import datetime
-from statistics import median
+from math import ceil
from openvino.inference_engine import IENetwork, IECore, get_version, StatusCode

from .utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
from .utils.logging import logger
from .utils.utils import get_duration_seconds
from .utils.statistics_report import StatisticsReport

+def percentile(values, percent):
+    return values[ceil(len(values) * percent / 100) - 1]

class Benchmark:
def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
duration_seconds: int = None, api_type: str = 'async'):
@@ -98,7 +101,7 @@ def first_infer(self, exe_network):
raise Exception(f"Wait for all requests is failed with status code {status}!")
return infer_request.latency

-def infer(self, exe_network, batch_size, progress_bar=None):
+def infer(self, exe_network, batch_size, latency_percentile, progress_bar=None):
progress_count = 0
infer_requests = exe_network.requests

@@ -155,7 +158,7 @@ def infer(self, exe_network, batch_size, progress_bar=None):
for infer_request_id in in_fly:
times.append(infer_requests[infer_request_id].latency)
times.sort()
-latency_ms = median(times)
+latency_ms = percentile(times, latency_percentile)
fps = batch_size * 1000 / latency_ms if self.api_type == 'sync' else batch_size * iteration / total_duration_sec
if progress_bar:
progress_bar.finish()
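Note that the new `percentile` helper indexes into its argument directly, so it relies on the caller sorting first — which `infer` does via `times.sort()`. A small usage sketch under that assumption (made-up numbers, not part of the commit):

```python
from math import ceil

def percentile(values, percent):
    # Same helper as above; `values` must already be sorted ascending.
    return values[ceil(len(values) * percent / 100) - 1]

times = [12.4, 9.1, 10.0, 11.7, 58.3]  # made-up request latencies (ms)
times.sort()                           # the caller sorts, as infer() does
print(percentile(times, 50))           # 11.7
print(percentile(times, 99))           # 58.3 -> worst-case tail
```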
13 changes: 10 additions & 3 deletions tools/benchmark_tool/openvino/tools/benchmark/main.py
@@ -344,7 +344,7 @@ def set_throughput_streams():
[
('first inference time (ms)', duration_ms)
])
-fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, progress_bar)
+fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, args.latency_percentile, progress_bar)

# ------------------------------------ 11. Dumping statistics report -------------------------------------------
next_step()
@@ -372,9 +372,13 @@ def set_throughput_streams():
('total number of iterations', str(iteration)),
])
if MULTI_DEVICE_NAME not in device_name:
+if args.latency_percentile == 50:
+    latency_prefix = 'latency (ms)'
+else:
+    latency_prefix = f'latency ({args.latency_percentile} percentile) (ms)'
statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
[
-('latency (ms)', f'{latency_ms:.2f}'),
+(latency_prefix, f'{latency_ms:.2f}'),
])

statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
@@ -388,7 +392,10 @@
print(f'Count: {iteration} iterations')
print(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
if MULTI_DEVICE_NAME not in device_name:
-print(f'Latency: {latency_ms:.2f} ms')
+if args.latency_percentile == 50:
+    print(f'Latency: {latency_ms:.2f} ms')
+else:
+    print(f'Latency ({args.latency_percentile} percentile): {latency_ms:.2f} ms')
print(f'Throughput: {fps:.2f} FPS')

del exe_network
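The same naming rule drives both the statistics report and the console output; a compact restatement of that branching as a standalone helper (hypothetical, not in the commit):

```python
def latency_label(latency_percentile: int) -> str:
    # Plain label for the default median; percentile-qualified otherwise.
    if latency_percentile == 50:
        return 'latency (ms)'
    return f'latency ({latency_percentile} percentile) (ms)'

assert latency_label(50) == 'latency (ms)'
assert latency_label(95) == 'latency (95 percentile) (ms)'
```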
2 changes: 2 additions & 0 deletions tools/benchmark_tool/openvino/tools/benchmark/parameters.py
@@ -84,6 +84,8 @@ def parse_args():
'Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency '
'estimations the number of streams should be set to 1. '
'See samples README for more details.')
+args.add_argument('--latency_percentile', type=int, required=False, default=50, choices=range(1,101),
+                  help='Optional. Defines the percentile to be reported in the latency metric. The valid range is [1, 100]. The default value is 50 (median).')
args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True, choices=[True, False],
help='Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. '
'\'true\' - enable bfloat16 regardless of platform support. '
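Because the argument is declared with `choices=range(1, 101)`, argparse enforces the [1, 100] range at parse time, mirroring the explicit check in the C++ sample. A self-contained sketch of that behavior (a standalone parser, not the tool's real one):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--latency_percentile', type=int, default=50,
                    choices=range(1, 101), metavar='[1-100]',
                    help='Percentile to report in the latency metric.')

print(parser.parse_args([]).latency_percentile)                              # 50
print(parser.parse_args(['--latency_percentile', '90']).latency_percentile)  # 90
# parser.parse_args(['--latency_percentile', '0'])  # exits: invalid choice
```

One note on this pattern: without a `metavar`, argparse would expand all one hundred choices into the usage string, so a compact metavar keeps the `-h` output readable.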
