From 5779fb4a22b6a9beeb00ba4d43d7275617e9fa1e Mon Sep 17 00:00:00 2001
From: Maxim Shevtsov
Date: Mon, 30 Nov 2020 16:03:42 +0300
Subject: [PATCH] [MULTI] Zero-copy (when backed by the deterministic app-level scheduling) (#3286)

* Optimized Infer Request Scheduling

* Fixed misprint

* Brushing the code and comments a bit

* further brushing of the ScheduleToWorkerRequest: moving the task execution directly into the loop over devices (avoids pointers and 'else' clause)

* 1) zero-copy (assuming deterministic app-level scheduling) for the multi-device, via "borrowing" the corresponding device-specific blobs and letting the app implicitly use these
2) Initial MULTI section in the opt guide (primarily to document a tip on helping the MULTI to keep the zero-copy path)

Co-authored-by: apankratovantonp
---
 .../dldt_optimization_guide.md     | 22 ++++++++++++++++++-
 .../multi_device_exec_network.cpp  | 16 +++++++++++++-
 .../multi_device_exec_network.hpp  |  2 ++
 .../multi_device_infer_request.cpp | 17 +++++++++++---
 .../multi_device_infer_request.hpp |  5 +++--
 5 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/docs/optimization_guide/dldt_optimization_guide.md b/docs/optimization_guide/dldt_optimization_guide.md
index 37299bdf1a22f7..73e99437ac8828 100644
--- a/docs/optimization_guide/dldt_optimization_guide.md
+++ b/docs/optimization_guide/dldt_optimization_guide.md
@@ -110,6 +110,26 @@ Also:
 
 The resulting IR precision, for instance, `FP16` or `FP32`, directly affects performance. As CPU now supports `FP16` (while internally upscaling to `FP32` anyway) and because this is the best precision for a GPU target, you may want to always convert models to `FP16`. Notice that this is the only precision that Intel® Movidius™ Myriad™ 2 and Intel® Myriad™ X VPUs support.
 
+## Multi-Device Execution
+OpenVINO™ toolkit supports automatic multi-device execution; see the [MULTI-Device plugin description](../IE_DG/supported_plugins/MULTI.md).
+The next chapter covers the device-specific tips, while this section lists a few recommendations
+for the multi-device execution:
+- MULTI usually performs best when the fastest device is specified first in the list of devices.
+  This is particularly important when the parallelism is not sufficient
+  (e.g. the number of requests in flight is not enough to saturate all devices).
+- It is highly recommended to query the optimal number of inference requests directly from the instance of the ExecutableNetwork
+  (returned by the LoadNetwork call with the specific multi-device configuration as a parameter).
+  Please refer to the code of the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample for details.
+- Notice that, for example, CPU+GPU execution performs better with certain knobs,
+  which you can find in the code of the same [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample.
+  One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams (already the default for the GPU) to amortize the slower
+  communication of inference completion from the device to the host.
+- The multi-device logic always attempts to save on the data copies (e.g. of the inputs) between the device-agnostic, user-facing inference requests
+  and the device-specific 'worker' requests that are actually scheduled behind the scenes.
+  To facilitate these copy savings, it is recommended to start the requests in the order in which they were created
+  (with ExecutableNetwork's CreateInferRequest).
+
+
 ## Device-Specific Optimizations
 
 The Inference Engine supports several target devices (CPU, GPU, Intel® Movidius™ Myriad™ 2 VPU, Intel® Movidius™ Myriad™ X VPU, Intel® Vision Accelerator Design with Intel® Movidius™ Vision Processing Units (VPU) and FPGA), and each of them has a corresponding plugin. If you want to optimize a specific device, you must keep in mind the following tips to increase the performance.
@@ -123,7 +143,7 @@ The only hint you can get from that is how the major primitives are accelerated
 Internally, the Inference Engine has a threading abstraction level, which allows for compiling the [open source version](https://github.com/opencv/dldt) with either Intel® Threading Building Blocks (Intel® TBB) which is now default, or OpenMP* as an alternative parallelism solution. When using inference on the CPU, this is particularly important to align threading model with the rest of your application (and any third-party libraries that you use) to avoid oversubscription. For more information, see Note on the App-Level Threading section.
 
 Since R1 2019, the OpenVINO™ toolkit comes pre-compiled with Intel TBB,
-so any OpenMP* API or environment settings (like `OMP_NUM_THREADS`) has no effect anymore.
+so any OpenMP* API or environment settings (like `OMP_NUM_THREADS`) has no effect.
 Certain tweaks (like number of threads used for inference on the CPU) are still possible via [CPU configuration options](../IE_DG/supported_plugins/CPU.md).
 Finally, the OpenVINO CPU inference is NUMA-aware, please refer to the Tips for inference on NUMA systems section.
diff --git a/inference-engine/src/multi_device/multi_device_exec_network.cpp b/inference-engine/src/multi_device/multi_device_exec_network.cpp
index de27554f514927..10b9a280963624 100644
--- a/inference-engine/src/multi_device/multi_device_exec_network.cpp
+++ b/inference-engine/src/multi_device/multi_device_exec_network.cpp
@@ -54,6 +54,7 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMap<InferenceEngine::ExecutableNetwork>&
     InferenceEngine::ExecutableNetworkThreadSafeDefault(nullptr, std::make_shared<InferenceEngine::ImmediateExecutor>()),
     _devicePriorities{networkDevices},
+    _devicePrioritiesInitial{networkDevices},
     _networksPerDevice{networksPerDevice},
     _config{config},
     _needPerfCounters{needPerfCounters} {
@@ -149,7 +150,20 @@ MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() {
 
 InferenceEngine::InferRequestInternal::Ptr MultiDeviceExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                                                 InferenceEngine::OutputsDataMap networkOutputs) {
-    return std::make_shared<MultiDeviceInferRequest>(networkInputs, networkOutputs);
+    auto num = _numRequestsCreated++;
+    size_t sum = 0;
+    InferenceEngine::InferRequest request_to_share_blobs_with;
+    // borrowing device-specific blobs from the underlying worker requests for the device-agnostic, user-facing requests;
+    // this potentially saves on the data copies later (if the requests are scheduled in the same order)
+    for (const auto& device : _devicePrioritiesInitial) {
+        auto& dev_requests = _workerRequests[device.deviceName];
+        if ((num - sum) < dev_requests.size()) {
+            request_to_share_blobs_with = dev_requests.at(num - sum)._inferRequest;
+            break;
+        }
+        sum += dev_requests.size();
+    }
+    return std::make_shared<MultiDeviceInferRequest>(networkInputs, networkOutputs, request_to_share_blobs_with);
 }
 
 IInferRequest::Ptr MultiDeviceExecutableNetwork::CreateInferRequest() {
diff --git a/inference-engine/src/multi_device/multi_device_exec_network.hpp b/inference-engine/src/multi_device/multi_device_exec_network.hpp
index df178d130ce071..bdea1e449e4041 100644
--- a/inference-engine/src/multi_device/multi_device_exec_network.hpp
+++ b/inference-engine/src/multi_device/multi_device_exec_network.hpp
@@ -125,12 +125,14 @@ class MultiDeviceExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDefault {
     std::atomic_bool                                             _terminate = {false};
     std::mutex                                                   _mutex;
     std::vector<DeviceInformation>                               _devicePriorities;
+    const std::vector<DeviceInformation>                         _devicePrioritiesInitial;
     DeviceMap<InferenceEngine::ExecutableNetwork>                _networksPerDevice;
     ThreadSafeQueue<InferenceEngine::Task>                       _inferPipelineTasks;
     DeviceMap<NotBusyWorkerRequests>                             _idleWorkerRequests;
     DeviceMap<std::vector<WorkerInferRequest>>                   _workerRequests;
     std::unordered_map<std::string, InferenceEngine::Parameter>  _config;
     bool                                                         _needPerfCounters = false;
+    std::atomic_size_t                                           _numRequestsCreated = {0};
 };
 
 }  // namespace MultiDevicePlugin
diff --git a/inference-engine/src/multi_device/multi_device_infer_request.cpp b/inference-engine/src/multi_device/multi_device_infer_request.cpp
index d021e0a30624f0..a662cc711346af 100644
--- a/inference-engine/src/multi_device/multi_device_infer_request.cpp
+++ b/inference-engine/src/multi_device/multi_device_infer_request.cpp
@@ -10,8 +10,17 @@ namespace MultiDevicePlugin {
     using namespace InferenceEngine;
 // ------------------------------MultiDeviceInferRequest----------------------------
 MultiDeviceInferRequest::MultiDeviceInferRequest(const InputsDataMap&  networkInputs,
-                                                 const OutputsDataMap& networkOutputs)
+                                                 const OutputsDataMap& networkOutputs,
+                                                 InferRequest request_to_share_blobs_with)
         : InferRequestInternal(networkInputs, networkOutputs) {
+    if (request_to_share_blobs_with) {
+        // borrow device-friendly blobs from the request
+        for (const auto &it : _networkInputs)
+            _inputs[it.first] = request_to_share_blobs_with.GetBlob(it.first);
+        for (const auto &it : _networkOutputs)
+            _outputs[it.first] = request_to_share_blobs_with.GetBlob(it.first);
+        return;
+    }
     // Allocate all input blobs
     for (const auto &it : networkInputs) {
         Layout l = it.second->getLayout();
@@ -40,14 +49,16 @@ void MultiDeviceInferRequest::SetBlobsToAnotherRequest(InferRequest& req) {
         auto &name = it.first;
         // this request is already in BUSY state, so using the internal functions safely
         GetBlob(name.c_str(), blob);
-        req.SetBlob(name.c_str(), blob);
+        if (req.GetBlob(name) != blob)
+            req.SetBlob(name, blob);
     }
     for (const auto &it : _networkOutputs) {
         Blob::Ptr blob;
         auto &name = it.first;
         // this request is already in BUSY state, so using the internal functions safely
         GetBlob(name.c_str(), blob);
-        req.SetBlob(name.c_str(), blob);
+        if (req.GetBlob(name) != blob)
+            req.SetBlob(name, blob);
     }
 }
 
diff --git a/inference-engine/src/multi_device/multi_device_infer_request.hpp b/inference-engine/src/multi_device/multi_device_infer_request.hpp
index aebeb6784f6106..80270cd117c294 100644
--- a/inference-engine/src/multi_device/multi_device_infer_request.hpp
+++ b/inference-engine/src/multi_device/multi_device_infer_request.hpp
@@ -23,14 +23,15 @@ class MultiDeviceInferRequest : public InferenceEngine::InferRequestInternal {
 public:
     using Ptr = std::shared_ptr<MultiDeviceInferRequest>;
     explicit MultiDeviceInferRequest(const InferenceEngine::InputsDataMap&  networkInputs,
-                                     const InferenceEngine::OutputsDataMap& networkOutputs);
+                                     const InferenceEngine::OutputsDataMap& networkOutputs,
+                                     InferenceEngine::InferRequest request_to_share_blobs_with);
     void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>&) const override {
         THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
     }
     void InferImpl() override {
         THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
     }
-    // Multi-Device impl specific: sets the data (blobs from the device-less requets to the specific device request)
+    // Multi-Device impl specific: sets the data (blobs from the device-less requests to the specific device request)
     void SetBlobsToAnotherRequest(InferenceEngine::InferRequest& req);
 };
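
For reference, the application-level flow that the new optimization-guide section recommends (query the optimal number of requests from the MULTI ExecutableNetwork, then create and start the requests in the same order) might look like the minimal sketch below. This is only a sketch, not code from this patch: it assumes the Inference Engine C++ API of this release, a hypothetical `model.xml`, and an illustrative `MULTI:GPU,CPU` device list.

```cpp
#include <inference_engine.hpp>

#include <vector>

int main() {
    using namespace InferenceEngine;
    Core ie;
    // Load the model onto the MULTI plugin; the fastest device (GPU here) is listed first.
    CNNNetwork network = ie.ReadNetwork("model.xml");  // hypothetical model path
    ExecutableNetwork exeNetwork = ie.LoadNetwork(network, "MULTI:GPU,CPU");

    // Query the optimal number of requests from the resulting ExecutableNetwork
    // instead of guessing it per device.
    auto nireq = exeNetwork.GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();

    // Create the requests once, in a fixed order; with the blob "borrowing" from this patch,
    // the n-th user-facing request shares blobs with the n-th device-specific worker request.
    std::vector<InferRequest> requests;
    for (unsigned int i = 0; i < nireq; ++i)
        requests.push_back(exeNetwork.CreateInferRequest());

    // A real application would fill the inputs here via GetBlob() (writing into the
    // borrowed, device-friendly blobs) and then start the requests in creation order,
    // which helps keep the zero-copy path.
    for (auto& req : requests)
        req.StartAsync();
    for (auto& req : requests)
        req.Wait(IInferRequest::WaitMode::RESULT_READY);
    return 0;
}
```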
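The index-to-worker mapping used by CreateInferRequestImpl in this patch can also be viewed in isolation: the n-th created user-facing request borrows blobs from the n-th worker request, counting the worker pools in the initial device-priority order. The standalone sketch below illustrates that arithmetic; the device names, pool sizes, and the `workerForRequest` helper are illustrative only and are not the plugin's actual types.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Illustration of the mapping: request number `num` lands in the worker pool of the
// first device whose cumulative pool size exceeds it (pool sizes are hypothetical).
std::pair<std::string, std::size_t> workerForRequest(
        std::size_t num,
        const std::vector<std::pair<std::string, std::size_t>>& workerPools) {
    std::size_t sum = 0;
    for (const auto& pool : workerPools) {
        if (num - sum < pool.second)
            return {pool.first, num - sum};  // device name and index within its pool
        sum += pool.second;
    }
    return {"", 0};  // more user requests than workers: nothing left to borrow from
}

int main() {
    // e.g. GPU exposes 4 worker requests, CPU exposes 2
    const std::vector<std::pair<std::string, std::size_t>> pools{{"GPU", 4}, {"CPU", 2}};
    for (std::size_t n = 0; n < 7; ++n) {
        auto w = workerForRequest(n, pools);
        std::cout << "request " << n << " -> "
                  << (w.first.empty() ? "none" : w.first) << "[" << w.second << "]\n";
    }
    return 0;
}
```

With these illustrative pool sizes, requests 0-3 borrow GPU blobs and requests 4-5 borrow CPU blobs, which is why starting the requests in creation order tends to keep the borrowed blobs on the device that actually executes them.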