From 5779fb4a22b6a9beeb00ba4d43d7275617e9fa1e Mon Sep 17 00:00:00 2001
From: Maxim Shevtsov
Date: Mon, 30 Nov 2020 16:03:42 +0300
Subject: [PATCH] [MULTI] Zero-copy (when backed by the deterministic app-level scheduling) (#3286)

* Optimized Infer Request Scheduling

* Fixed misprint

* Brushing the code and comments a bit

* further brushing of the ScheduleToWorkerRequest: moving the task execution directly into the loop over devices (avoids pointers and 'else' clause)

* 1) zero-copy (assuming deterministic app-level scheduling) for the multi-device, via "borrowing" the corresponding device-specific blobs and letting the app implicitly use these
2) Initial MULTI section in the opt guide (primarily to document a tip on helping the MULTI to keep the zero-copy path)

Co-authored-by: apankratovantonp
---
 .../dldt_optimization_guide.md     | 22 ++++++++++++++++++-
 .../multi_device_exec_network.cpp  | 16 +++++++++++++-
 .../multi_device_exec_network.hpp  |  2 ++
 .../multi_device_infer_request.cpp | 17 +++++++++++---
 .../multi_device_infer_request.hpp |  5 +++--
 5 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/docs/optimization_guide/dldt_optimization_guide.md b/docs/optimization_guide/dldt_optimization_guide.md
index 37299bdf1a22f7..73e99437ac8828 100644
--- a/docs/optimization_guide/dldt_optimization_guide.md
+++ b/docs/optimization_guide/dldt_optimization_guide.md
@@ -110,6 +110,26 @@ Also:
 
 The resulting IR precision, for instance, `FP16` or `FP32`, directly affects performance. As CPU now supports `FP16` (while internally upscaling to `FP32` anyway) and because this is the best precision for a GPU target, you may want to always convert models to `FP16`. Notice that this is the only precision that Intel® Movidius™ Myriad™ 2 and Intel® Myriad™ X VPUs support.
 
+## Multi-Device Execution
+OpenVINO™ toolkit supports automatic multi-device execution; see the [MULTI-Device plugin description](../IE_DG/supported_plugins/MULTI.md).
+The next chapter covers the device-specific tips, while this section lists a few recommendations
+for the multi-device execution:
+- MULTI usually performs best when the fastest device is specified first in the list of devices.
+  This is particularly important when the parallelism is not sufficient
+  (e.g. the number of requests in flight is not enough to saturate all devices).
+- It is highly recommended to query the optimal number of inference requests directly from the instance of the ExecutableNetwork
+  (returned by the LoadNetwork call with the specific multi-device configuration as a parameter).
+  Please refer to the code of the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample for details.
+- Notice that, for example, CPU+GPU execution performs better with certain knobs,
+  which you can find in the code of the same [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample.
+  One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams (already the default for the GPU) to amortize the slower
+  communication of inference completion from the device to the host.
+- The multi-device logic always attempts to save on the data copies (e.g. of the inputs) between the device-agnostic, user-facing inference requests
+  and the device-specific 'worker' requests that are actually scheduled behind the scenes.
+  To facilitate these copy savings, it is recommended to start the requests in the order in which they were created
+  (with ExecutableNetwork's CreateInferRequest).
+
+
 ## Device-Specific Optimizations
 
 The Inference Engine supports several target devices (CPU, GPU, Intel® Movidius™ Myriad™ 2 VPU, Intel® Movidius™ Myriad™ X VPU, Intel® Vision Accelerator Design with Intel® Movidius™ Vision Processing Units (VPU) and FPGA), and each of them has a corresponding plugin. If you want to optimize a specific device, you must keep in mind the following tips to increase the performance.
@@ -123,7 +143,7 @@ The only hint you can get from that is how the major primitives are accelerated
 Internally, the Inference Engine has a threading abstraction level, which allows for compiling the [open source version](https://github.com/opencv/dldt) with either Intel® Threading Building Blocks (Intel® TBB) which is now default, or OpenMP* as an alternative parallelism solution. When using inference on the CPU, this is particularly important to align threading model with the rest of your application (and any third-party libraries that you use) to avoid oversubscription. For more information, see Note on the App-Level Threading section.
 
 Since R1 2019, the OpenVINO™ toolkit comes pre-compiled with Intel TBB,
-so any OpenMP* API or environment settings (like `OMP_NUM_THREADS`) has no effect anymore.
+so any OpenMP* API or environment settings (like `OMP_NUM_THREADS`) has no effect.
 Certain tweaks (like number of threads used for inference on the CPU) are still possible via [CPU configuration options](../IE_DG/supported_plugins/CPU.md).
 Finally, the OpenVINO CPU inference is NUMA-aware, please refer to the Tips for inference on NUMA systems section.
diff --git a/inference-engine/src/multi_device/multi_device_exec_network.cpp b/inference-engine/src/multi_device/multi_device_exec_network.cpp
index de27554f514927..10b9a280963624 100644
--- a/inference-engine/src/multi_device/multi_device_exec_network.cpp
+++ b/inference-engine/src/multi_device/multi_device_exec_network.cpp
@@ -54,6 +54,7 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMap<InferenceEngine::ExecutableNetwork>&
     InferenceEngine::ExecutableNetworkThreadSafeDefault(nullptr, std::make_shared<InferenceEngine::ImmediateExecutor>()),
     _devicePriorities{networkDevices},
+    _devicePrioritiesInitial{networkDevices},
     _networksPerDevice{networksPerDevice},
     _config{config},
     _needPerfCounters{needPerfCounters} {
@@ -149,7 +150,20 @@ MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() {
 
 InferenceEngine::InferRequestInternal::Ptr MultiDeviceExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                                                 InferenceEngine::OutputsDataMap networkOutputs) {
-    return std::make_shared<MultiDeviceInferRequest>(networkInputs, networkOutputs);
+    auto num = _numRequestsCreated++;
+    size_t sum = 0;
+    InferenceEngine::InferRequest request_to_share_blobs_with;
+    // borrowing device-specific blobs from the underlying worker requests for the device-agnostic, user-facing requests;
+    // this potentially saves on the data copies later (if the requests are scheduled in the same order)
+    for (const auto& device : _devicePrioritiesInitial) {
+        auto& dev_requests = _workerRequests[device.deviceName];
+        if ((num - sum) < dev_requests.size()) {
+            request_to_share_blobs_with = dev_requests.at(num - sum)._inferRequest;
+            break;
+        }
+        sum += dev_requests.size();
+    }
+    return std::make_shared<MultiDeviceInferRequest>(networkInputs, networkOutputs, request_to_share_blobs_with);
 }
 
 IInferRequest::Ptr MultiDeviceExecutableNetwork::CreateInferRequest() {
diff --git a/inference-engine/src/multi_device/multi_device_exec_network.hpp b/inference-engine/src/multi_device/multi_device_exec_network.hpp
index df178d130ce071..bdea1e449e4041 100644
--- a/inference-engine/src/multi_device/multi_device_exec_network.hpp
+++ b/inference-engine/src/multi_device/multi_device_exec_network.hpp
@@ -125,12 +125,14 @@ class MultiDeviceExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDefault {
     std::atomic_bool                                             _terminate = {false};
     std::mutex                                                   _mutex;
     std::vector<DeviceInformation>                               _devicePriorities;
+    const std::vector<DeviceInformation>                         _devicePrioritiesInitial;
     DeviceMap<InferenceEngine::ExecutableNetwork>                _networksPerDevice;
     ThreadSafeQueue<InferenceEngine::Task>                       _inferPipelineTasks;
     DeviceMap<NotBusyWorkerRequests>                             _idleWorkerRequests;
     DeviceMap<std::vector<WorkerInferRequest>>                   _workerRequests;
     std::unordered_map<std::string, InferenceEngine::Parameter>  _config;
     bool                                                         _needPerfCounters = false;
+    std::atomic_size_t                                           _numRequestsCreated = {0};
 };
 
 }  // namespace MultiDevicePlugin
diff --git a/inference-engine/src/multi_device/multi_device_infer_request.cpp b/inference-engine/src/multi_device/multi_device_infer_request.cpp
index d021e0a30624f0..a662cc711346af 100644
--- a/inference-engine/src/multi_device/multi_device_infer_request.cpp
+++ b/inference-engine/src/multi_device/multi_device_infer_request.cpp
@@ -10,8 +10,17 @@ namespace MultiDevicePlugin {
     using namespace InferenceEngine;
 // ------------------------------MultiDeviceInferRequest----------------------------
 MultiDeviceInferRequest::MultiDeviceInferRequest(const InputsDataMap&  networkInputs,
-                                                 const OutputsDataMap& networkOutputs)
+                                                 const OutputsDataMap& networkOutputs,
+                                                 InferRequest request_to_share_blobs_with)
         : InferRequestInternal(networkInputs, networkOutputs) {
+    if (request_to_share_blobs_with) {
+        // borrow device-friendly blobs from the request
+        for (const auto &it : _networkInputs)
+            _inputs[it.first] = request_to_share_blobs_with.GetBlob(it.first);
+        for (const auto &it : _networkOutputs)
+            _outputs[it.first] = request_to_share_blobs_with.GetBlob(it.first);
+        return;
+    }
     // Allocate all input blobs
     for (const auto &it : networkInputs) {
         Layout l = it.second->getLayout();
@@ -40,14 +49,16 @@ void MultiDeviceInferRequest::SetBlobsToAnotherRequest(InferRequest& req) {
         auto &name = it.first;
         // this request is already in BUSY state, so using the internal functions safely
         GetBlob(name.c_str(), blob);
-        req.SetBlob(name.c_str(), blob);
+        if (req.GetBlob(name) != blob)
+            req.SetBlob(name, blob);
     }
     for (const auto &it : _networkOutputs) {
         Blob::Ptr blob;
         auto &name = it.first;
         // this request is already in BUSY state, so using the internal functions safely
         GetBlob(name.c_str(), blob);
-        req.SetBlob(name.c_str(), blob);
+        if (req.GetBlob(name) != blob)
+            req.SetBlob(name, blob);
     }
 }
 
diff --git a/inference-engine/src/multi_device/multi_device_infer_request.hpp b/inference-engine/src/multi_device/multi_device_infer_request.hpp
index aebeb6784f6106..80270cd117c294 100644
--- a/inference-engine/src/multi_device/multi_device_infer_request.hpp
+++ b/inference-engine/src/multi_device/multi_device_infer_request.hpp
@@ -23,14 +23,15 @@ class MultiDeviceInferRequest : public InferenceEngine::InferRequestInternal {
 public:
     using Ptr = std::shared_ptr<MultiDeviceInferRequest>;
     explicit MultiDeviceInferRequest(const InferenceEngine::InputsDataMap&  networkInputs,
-                                     const InferenceEngine::OutputsDataMap& networkOutputs);
+                                     const InferenceEngine::OutputsDataMap& networkOutputs,
+                                     InferenceEngine::InferRequest request_to_share_blobs_with);
     void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>&) const override {
         THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
     }
     void InferImpl() override {
         THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
     }
-    // Multi-Device impl specific: sets the data (blobs from the device-less requets to the specific device request)
+    // Multi-Device impl specific: sets the data (blobs from the device-less requests to the specific device request)
     void SetBlobsToAnotherRequest(InferenceEngine::InferRequest& req);
 };
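
For reference, the application-level flow that the new optimization-guide section recommends (query the optimal number of requests from the MULTI ExecutableNetwork, then create and start the requests in the same order) might look like the minimal sketch below. This is only a sketch, not code from this patch: it assumes the Inference Engine C++ API of this release, a hypothetical `model.xml`, and an illustrative `MULTI:GPU,CPU` device list.

```cpp
#include <inference_engine.hpp>

#include <vector>

int main() {
    using namespace InferenceEngine;
    Core ie;
    // Load the model onto the MULTI plugin; the fastest device (GPU here) is listed first.
    CNNNetwork network = ie.ReadNetwork("model.xml");  // hypothetical model path
    ExecutableNetwork exeNetwork = ie.LoadNetwork(network, "MULTI:GPU,CPU");

    // Query the optimal number of requests from the resulting ExecutableNetwork
    // instead of guessing it per device.
    auto nireq = exeNetwork.GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();

    // Create the requests once, in a fixed order; with the blob "borrowing" from this patch,
    // the n-th user-facing request shares blobs with the n-th device-specific worker request.
    std::vector<InferRequest> requests;
    for (unsigned int i = 0; i < nireq; ++i)
        requests.push_back(exeNetwork.CreateInferRequest());

    // A real application would fill the inputs here via GetBlob() (writing into the
    // borrowed, device-friendly blobs) and then start the requests in creation order,
    // which helps keep the zero-copy path.
    for (auto& req : requests)
        req.StartAsync();
    for (auto& req : requests)
        req.Wait(IInferRequest::WaitMode::RESULT_READY);
    return 0;
}
```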
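The index-to-worker mapping used by CreateInferRequestImpl in this patch can also be viewed in isolation: the n-th created user-facing request borrows blobs from the n-th worker request, counting the worker pools in the initial device-priority order. The standalone sketch below illustrates that arithmetic; the device names, pool sizes, and the `workerForRequest` helper are illustrative only and are not the plugin's actual types.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Illustration of the mapping: request number `num` lands in the worker pool of the
// first device whose cumulative pool size exceeds it (pool sizes are hypothetical).
std::pair<std::string, std::size_t> workerForRequest(
        std::size_t num,
        const std::vector<std::pair<std::string, std::size_t>>& workerPools) {
    std::size_t sum = 0;
    for (const auto& pool : workerPools) {
        if (num - sum < pool.second)
            return {pool.first, num - sum};  // device name and index within its pool
        sum += pool.second;
    }
    return {"", 0};  // more user requests than workers: nothing left to borrow from
}

int main() {
    // e.g. GPU exposes 4 worker requests, CPU exposes 2
    const std::vector<std::pair<std::string, std::size_t>> pools{{"GPU", 4}, {"CPU", 2}};
    for (std::size_t n = 0; n < 7; ++n) {
        auto w = workerForRequest(n, pools);
        std::cout << "request " << n << " -> "
                  << (w.first.empty() ? "none" : w.first) << "[" << w.second << "]\n";
    }
    return 0;
}
```

With these illustrative pool sizes, requests 0-3 borrow GPU blobs and requests 4-5 borrow CPU blobs, which is why starting the requests in creation order tends to keep the borrowed blobs on the device that actually executes them.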