zero-copy (assuming deterministic app-level scheduling) for the multi-device, via "borrowing" the corresponding device-specific blobs and letting the app implicitly use these
myshevts committed Nov 23, 2020
1 parent 7e23055 commit 3ffa35c
Showing 4 changed files with 37 additions and 8 deletions.
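
The change only pays off under the assumption called out in the commit message: the application must schedule its requests deterministically, so that each user-facing request keeps landing on the device-specific worker whose blobs it borrowed at creation time. A minimal sketch of that application-side pattern (the model path, request count, and "MULTI:GPU,CPU" device list are placeholders; the classic InferenceEngine API of this release is assumed):

    #include <inference_engine.hpp>
    #include <vector>

    int main() {
        InferenceEngine::Core core;
        auto network = core.ReadNetwork("model.xml");            // placeholder model
        auto exec = core.LoadNetwork(network, "MULTI:GPU,CPU");  // placeholder devices
        // create the request pool once, up front: each user-facing request borrows
        // the blobs of one underlying device-specific request (see the diff below)
        std::vector<InferenceEngine::InferRequest> requests;
        for (size_t i = 0; i < 4; ++i)
            requests.push_back(exec.CreateInferRequest());
        // deterministic app-level scheduling: always run the requests in creation
        // order, so each one reaches the worker whose blobs it borrowed and
        // SetBlobsToAnotherRequest() finds nothing that needs re-setting
        for (size_t iter = 0; iter < 100; ++iter) {
            auto& request = requests[iter % requests.size()];
            request.StartAsync();
            request.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
        }
        return 0;
    }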
19 changes: 17 additions & 2 deletions inference-engine/src/multi_device/multi_device_exec_network.cpp
@@ -55,6 +55,7 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMap<Infer
const bool needPerfCounters) :
InferenceEngine::ExecutableNetworkThreadSafeDefault(nullptr, std::make_shared<InferenceEngine::ImmediateExecutor>()),
_devicePriorities{networkDevices},
+ _devicePrioritiesInitial{networkDevices},
_networksPerDevice{networksPerDevice},
_config{config},
_needPerfCounters{needPerfCounters} {
@@ -92,7 +93,8 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMap<Infer
auto capturedTask = std::move(workerRequestPtr->_task);
capturedTask();
}
- if (!_terminate) {
+ // check the termination status and the work availability before triggering the scheduling logic
+ if (!_terminate && !_inferPipelineTasks.empty()) {
idleGuard.Release()->push(workerRequestPtr);
ScheduleToWorkerInferRequest();
}
@@ -143,7 +145,20 @@ MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() {

InferenceEngine::InferRequestInternal::Ptr MultiDeviceExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) {
- return std::make_shared<MultiDeviceInferRequest>(networkInputs, networkOutputs);
+ auto num = _numRequestsCreated++;
+ size_t sum = 0;
+ InferenceEngine::InferRequest request_to_share_blobs_with;
+ // borrowing device-specific blobs from the underlying requests for the device-agnostic, user-facing requests
+ // this potentially saves on the data-copy later (if the requests are scheduled in the same order)
+ for (const auto& device : _devicePrioritiesInitial) {
+ auto& dev_requests = _workerRequests[device.deviceName];
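+ // worker requests form per-device pools, concatenated in the initial priority
+ // order: user-facing request #num borrows from the worker at offset (num - sum)
+ // of the first pool it falls into. E.g. with 2 GPU workers ahead of 4 CPU
+ // workers, requests #0-1 borrow from GPU, #2-5 from CPU, and any later request
+ // shares nothing (request_to_share_blobs_with stays empty, blobs get allocated)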
+ if ((num - sum) < dev_requests.size()) {
+ request_to_share_blobs_with = dev_requests.at(num - sum)._inferRequest;
+ break;
+ }
+ sum += dev_requests.size();
+ }
+ return std::make_shared<MultiDeviceInferRequest>(networkInputs, networkOutputs, request_to_share_blobs_with);
}

IInferRequest::Ptr MultiDeviceExecutableNetwork::CreateInferRequest() {
2 changes: 2 additions & 0 deletions inference-engine/src/multi_device/multi_device_exec_network.hpp
@@ -99,12 +99,14 @@ class MultiDeviceExecutableNetwork : public InferenceEngine::ExecutableNetworkTh
std::atomic_bool _terminate = {false};
std::mutex _mutex;
std::vector<DeviceInformation> _devicePriorities;
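+ // snapshot of the device priorities taken at load time; kept const so that the
+ // request-to-worker mapping in CreateInferRequestImpl stays stable even if
+ // _devicePriorities above changes at runtime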
+ const std::vector<DeviceInformation> _devicePrioritiesInitial;
DeviceMap<InferenceEngine::ExecutableNetwork> _networksPerDevice;
ThreadSafeQueue<InferenceEngine::Task> _inferPipelineTasks;
DeviceMap<NotBusyWorkerRequests> _idleWorkerRequests;
DeviceMap<std::vector<WorkerInferRequest>> _workerRequests;
std::unordered_map<std::string, InferenceEngine::Parameter> _config;
bool _needPerfCounters = false;
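+ // running count of the user-facing requests ever created, used to pair each
+ // new request with a worker request in CreateInferRequestImpl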
+ std::atomic_size_t _numRequestsCreated = {0};
};

} // namespace MultiDevicePlugin
19 changes: 15 additions & 4 deletions inference-engine/src/multi_device/multi_device_infer_request.cpp
@@ -10,8 +10,17 @@ namespace MultiDevicePlugin {
using namespace InferenceEngine;
// ------------------------------MultiDeviceInferRequest----------------------------
MultiDeviceInferRequest::MultiDeviceInferRequest(const InputsDataMap& networkInputs,
- const OutputsDataMap& networkOutputs)
+ const OutputsDataMap& networkOutputs,
+ InferRequest request_to_share_blobs_with)
: InferRequestInternal(networkInputs, networkOutputs) {
+ if (request_to_share_blobs_with) {
+ // borrow device-friendly blobs from the request
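+ // (zero-copy only materializes if the app later schedules this request onto
+ // the very worker it borrowed from; otherwise SetBlobsToAnotherRequest
+ // re-points the worker's blobs as before)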
+ for (const auto &it : _networkInputs)
+ _inputs[it.first] = request_to_share_blobs_with.GetBlob(it.first);
+ for (const auto &it : _networkOutputs)
+ _outputs[it.first] = request_to_share_blobs_with.GetBlob(it.first);
+ return;
+ }
// Allocate all input blobs
for (const auto &it : networkInputs) {
Layout l = it.second->getLayout();
@@ -40,14 +49,16 @@ void MultiDeviceInferRequest::SetBlobsToAnotherRequest(InferRequest& req) {
auto &name = it.first;
// this request is already in BUSY state, so it is safe to use the internal functions
GetBlob(name.c_str(), blob);
- req.SetBlob(name.c_str(), blob);
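+ // skip SetBlob when the worker already holds this very blob, i.e. the request
+ // borrowed it at creation time: this preserves the zero-copy path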
+ if (req.GetBlob(name) != blob)
+ req.SetBlob(name, blob);
}
for (const auto &it : _networkOutputs) {
Blob::Ptr blob;
- auto &name = it.first;
+ auto& name = it.first;
// this request is already in BUSY state, so it is safe to use the internal functions
GetBlob(name.c_str(), blob);
- req.SetBlob(name.c_str(), blob);
+ if (req.GetBlob(name) != blob)
+ req.SetBlob(name, blob);
}
}

5 changes: 3 additions & 2 deletions inference-engine/src/multi_device/multi_device_infer_request.hpp
@@ -23,14 +23,15 @@ class MultiDeviceInferRequest : public InferenceEngine::InferRequestInternal {
public:
using Ptr = std::shared_ptr<MultiDeviceInferRequest>;
explicit MultiDeviceInferRequest(const InferenceEngine::InputsDataMap& networkInputs,
- const InferenceEngine::OutputsDataMap& networkOutputs);
+ const InferenceEngine::OutputsDataMap& networkOutputs,
+ InferenceEngine::InferRequest request_to_share_blobs_with);
void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>&) const override {
THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
}
void InferImpl() override {
THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
}
- // Multi-Device impl specific: sets the data (blobs from the device-less requets to the specific device request)
+ // Multi-Device impl specific: sets the data (blobs from the device-less requests to the specific device request)
void SetBlobsToAnotherRequest(InferenceEngine::InferRequest& req);
};

