From f2f83f8177c7edfed3afb99c11a8ca9e668e7352 Mon Sep 17 00:00:00 2001 From: Andrew Park Date: Thu, 19 May 2022 22:32:18 +0900 Subject: [PATCH] Enable DynamicBatch related logics Signed-off-by: Andrew Park --- .../intel_gpu/plugin/infer_request.hpp | 37 +- .../intel_gpu/plugin/infer_request_legacy.hpp | 10 +- .../intel_gpu/src/plugin/infer_request.cpp | 421 +++++++++--------- .../src/plugin/infer_request_legacy.cpp | 14 +- src/plugins/intel_gpu/src/plugin/program.cpp | 10 + 5 files changed, 247 insertions(+), 245 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp index 786ee849f3174f..3f554fb7d23e02 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp @@ -16,11 +16,10 @@ namespace ov { namespace runtime { namespace intel_gpu { -// TODO(Andrew): Enable below to support dynamic batch -// struct buf_info { -// size_t buf_offset; -// size_t buf_size; -// }; +struct buf_info { + size_t buf_offset; + size_t buf_size; +}; class CompiledModel; @@ -48,8 +47,7 @@ class InferRequest : public InferenceEngine::IInferRequestInternal { void SetBlob(const std::string& name, const InferenceEngine::Blob::Ptr &data) override; void SetBlobs(const std::string& name, const std::vector &data) override; - // TODO(Andrew): Enable below to support dynamic batch - // void SetBatch(int batch = -1) override; + void SetBatch(int batch = -1) override; void SetGraph(std::shared_ptr graph); void EnableProfiling() { m_useProfiling = true; } void EnableStreams() { m_useStreams = true; } @@ -63,10 +61,9 @@ class InferRequest : public InferenceEngine::IInferRequestInternal { void enqueue(); void wait(); - // TODO(Andrew): Enable below to support dynamic batch - // void preprocess_dynamic(); - // void enqueue_dynamic(); - // void wait_dynamic(); + void preprocess_dynamic(); + void enqueue_dynamic(); + void wait_dynamic(); bool use_external_queue() const { return m_useExternalQueue; } void enable_external_queue() { m_useExternalQueue = true; } @@ -83,10 +80,9 @@ class InferRequest : public InferenceEngine::IInferRequestInternal { bool m_useExternalQueue = false; std::shared_ptr m_graph; - // TODO(Andrew): Enable below to support dynamic batch // dynamic batch stuff - // std::map> batchInputs; - // std::map> batchOutputs; + std::map> batchInputs; + std::map> batchOutputs; InferenceEngine::IStreamsExecutor* streamExecutor = nullptr; void prepare_input(const cldnn::primitive_id &inputName, InferenceEngine::Blob::Ptr &inputBlob, @@ -97,22 +93,21 @@ class InferRequest : public InferenceEngine::IInferRequestInternal { std::shared_ptr alloc = nullptr); InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc); - void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr); + void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr); void copy_input_data(std::shared_ptr network, const cldnn::primitive_id &inputName, - const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob); + const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob, + buf_info* bi = nullptr); InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem); void allocate_inputs(); void allocate_outputs(); - // TODO(Andrew): Enable below to support 
dynamic batch - // void allocate_inputs_dynamic(); - // void allocate_outputs_dynamic(); + void allocate_inputs_dynamic(); + void allocate_outputs_dynamic(); InferenceEngine::Blob::Ptr reinterpret_device_blob(InferenceEngine::Blob::Ptr data, const InferenceEngine::TensorDesc& new_desc); std::map internal_outputs; - // TODO(Andrew): Enable below to support dynamic batch - // std::vector> internal_outputs_dynamic; + std::vector> internal_outputs_dynamic; }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp index 833e868fda20db..2f31c645a35a6f 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp @@ -16,7 +16,7 @@ namespace ov { namespace runtime { namespace intel_gpu { -struct buf_info { +struct buf_info_legacy { size_t buf_offset; size_t buf_size; }; @@ -81,8 +81,8 @@ class InferRequestLegacy : public InferenceEngine::IInferRequestInternal { std::shared_ptr m_graph; // dynamic batch stuff - std::map> batchInputs; - std::map> batchOutputs; + std::map> batchInputs; + std::map> batchOutputs; InferenceEngine::IStreamsExecutor* streamExecutor = nullptr; void prepare_input(const cldnn::primitive_id &inputName, InferenceEngine::Blob::Ptr &inputBlob, @@ -93,10 +93,10 @@ class InferRequestLegacy : public InferenceEngine::IInferRequestInternal { std::shared_ptr alloc = nullptr); InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout); - void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr); + void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info_legacy* bi = nullptr); void copy_input_data(std::shared_ptr network, const cldnn::primitive_id &inputName, const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob, - buf_info* bi = nullptr); + buf_info_legacy* bi = nullptr); InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem); void allocate_inputs(); diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp index 48f194f5f71d51..adb829addfbf6c 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp @@ -48,8 +48,9 @@ void convertAndCopy(const InferenceEngine::Blob* src, dst_t* dst) { } template -void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, cldnn::stream& stream) { - size_t n = dst->size(); +void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info* bi, cldnn::stream& stream) { + size_t n = (bi == nullptr) ? dst->size() : bi->buf_size; + size_t offset = (bi == nullptr) ? 
0 : bi->buf_offset; auto layout = src->get_layout(); auto size = layout.get_tensor(); @@ -61,6 +62,7 @@ void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, cldnn::stream } cldnn::mem_lock src_lock{ src, stream }; src_dt* src_ptr = src_lock.data(); + dst_ptr += offset; if (layout.data_padding) { for (size_t b = 0; b < size.batch[0]; b++) { @@ -341,13 +343,12 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) { IE_THROW(NotImplemented) << cannot_set_compound; } if (isDynamic) { - // TODO(Andrew): Enable below to support dynamic batch // extract new batch size from blob - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // const auto batch_idx = m_graph->GetInputDynBatchDims()[name].first; - // if (batch_idx >= 0) - // SetBatch(blobDesc.getDims()[batch_idx]); - // } + if (m_graph->GetMaxDynamicBatchSize() > 1) { + const auto batch_idx = m_graph->GetInputDynBatchDims()[name].first; + if (batch_idx >= 0) + SetBatch(blobDesc.getDims()[batch_idx]); + } // We must create new input data if it has never been allocated or previously allocated // device blob is smaller than currently assigned user blob bool needs_realloc = _deviceInputs.find(name) == _deviceInputs.end() || _deviceInputs.at(name)->byteSize() < data->byteSize(); @@ -522,92 +523,91 @@ void InferRequest::SetGraph(std::shared_ptr graph) { IE_THROW(NetworkNotLoaded); } - // TODO(Andrew): Enable below to support dynamic batch - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // SetBatch(m_graph->GetMaxDynamicBatchSize()); - // allocate_inputs_dynamic(); - // allocate_outputs_dynamic(); - // } - if (!m_graph->GetNetwork()->is_dynamic()) { - allocate_inputs(); - allocate_outputs(); + if (m_graph->GetMaxDynamicBatchSize() > 1) { + SetBatch(m_graph->GetMaxDynamicBatchSize()); + allocate_inputs_dynamic(); + allocate_outputs_dynamic(); + } else { + if (!m_graph->GetNetwork()->is_dynamic()) { + allocate_inputs(); + allocate_outputs(); + } } } -// TODO(Andrew): Enable below to support dynamic batch -// void InferRequest::SetBatch(int new_batch) { -// OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::SetBatch"); -// if (m_graph->GetMaxDynamicBatchSize() < 0) -// IE_THROW() << "Dynamic batch is not enabled."; +void InferRequest::SetBatch(int new_batch) { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::SetBatch"); + if (m_graph->GetMaxDynamicBatchSize() < 0) + IE_THROW() << "Dynamic batch is not enabled."; -// if (new_batch < 1 || new_batch > m_graph->GetMaxDynamicBatchSize()) { -// IE_THROW() << "Invalid dynamic batch size " << new_batch << -// " for this request. Got: " << new_batch << ". Expected value in range [1;" << m_graph->GetMaxDynamicBatchSize() << "]"; -// } + if (new_batch < 1 || new_batch > m_graph->GetMaxDynamicBatchSize()) { + IE_THROW() << "Invalid dynamic batch size " << new_batch << + " for this request. Got: " << new_batch << ". 
Expected value in range [1;" << m_graph->GetMaxDynamicBatchSize() << "]"; + } -// if (new_batch == m_curBatch) -// return; + if (new_batch == m_curBatch) + return; -// batchInputs.clear(); -// batchOutputs.clear(); + batchInputs.clear(); + batchOutputs.clear(); -// // tune expected inputs -// for (auto& input : m_graph->GetNetworkInputs()) { -// auto sz = input.second->getTensorDesc().getDims(); -// const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first; -// if (batch_idx >= 0) -// sz[batch_idx] = 1; + // tune expected inputs + for (auto& input : m_graph->GetNetworkInputs()) { + auto sz = input.second->getTensorDesc().getDims(); + const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first; + if (batch_idx >= 0) + sz[batch_idx] = 1; -// size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies()); -// std::vector in_buf; + size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies()); + std::vector in_buf; -// size_t offset = 0; -// size_t bsz = single_batch; + size_t offset = 0; + size_t bsz = single_batch; -// // calculate metadata for input buffers -// for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) { -// unsigned int mask = 1 << nb; + // calculate metadata for input buffers + for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) { + unsigned int mask = 1 << nb; -// buf_info ib = { offset, bsz }; -// in_buf.push_back(ib); + buf_info ib = { offset, bsz }; + in_buf.push_back(ib); -// if (new_batch & mask) -// offset += bsz; -// bsz <<= 1; -// } + if (new_batch & mask) + offset += bsz; + bsz <<= 1; + } -// batchInputs[input.first] = in_buf; -// } + batchInputs[input.first] = in_buf; + } -// // tune expected outputs -// for (auto& no : m_graph->GetNetworkOutputs()) { -// auto sz = no.second->getTensorDesc().getDims(); -// const auto batch_idx = m_graph->GetInputDynBatchDims()[no.first].first; -// if (batch_idx >= 0) -// sz[batch_idx] = 1; -// size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies()); -// std::vector out_buf; + // tune expected outputs + for (auto& no : m_graph->GetNetworkOutputs()) { + auto sz = no.second->getTensorDesc().getDims(); + const auto batch_idx = m_graph->GetInputDynBatchDims()[no.first].first; + if (batch_idx >= 0) + sz[batch_idx] = 1; + size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies()); + std::vector out_buf; -// size_t offset = 0; -// size_t bsz = single_batch; -// // calculate metadata for output buffers -// for (uint32_t nb = 0; nb < m_graph->GetNetworksCount(); nb++) { -// uint32_t mask = 1 << nb; + size_t offset = 0; + size_t bsz = single_batch; + // calculate metadata for output buffers + for (uint32_t nb = 0; nb < m_graph->GetNetworksCount(); nb++) { + uint32_t mask = 1 << nb; -// buf_info ob = { offset, bsz }; -// out_buf.push_back(ob); + buf_info ob = { offset, bsz }; + out_buf.push_back(ob); -// if (new_batch & mask) -// offset += bsz; + if (new_batch & mask) + offset += bsz; -// bsz <<= 1; -// } + bsz <<= 1; + } -// batchOutputs[no.first] = out_buf; -// } + batchOutputs[no.first] = out_buf; + } -// m_curBatch = new_batch; -// } + m_curBatch = new_batch; +} InferRequest::InferRequest(InputsDataMap networkInputs, OutputsDataMap networkOutputs, const CompiledModel::Ptr& execNetwork) @@ -629,22 +629,20 @@ InferRequest::InferRequest(const std::vector>& i // 
----------------------------------------------------------------------------------------- // void InferRequest::preprocess_notify() { m_graph->wait(Graph::Stage::PREPROC); - // TODO(Andrew): Enable below to support dynamic batch - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // preprocess_dynamic(); - // } else { - // execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP - // } + if (m_graph->GetMaxDynamicBatchSize() > 1) { + preprocess_dynamic(); + } else { + execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP + } m_graph->notify(Graph::Stage::PREPROC); } void InferRequest::preprocess() { - // TODO(Andrew): Enable below to support dynamic batch - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // preprocess_dynamic(); - // } else { - // execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP - // } + if (m_graph->GetMaxDynamicBatchSize() > 1) { + preprocess_dynamic(); + } else { + execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP + } } void InferRequest::enqueue_notify() { @@ -653,11 +651,10 @@ void InferRequest::enqueue_notify() { } void InferRequest::enqueue() { - // TODO(Andrew): Enable below to support dynamic batch - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // enqueue_dynamic(); - // return; - // } + if (m_graph->GetMaxDynamicBatchSize() > 1) { + enqueue_dynamic(); + return; + } // set input and output memory from request blob maps // into the network object primitives @@ -770,11 +767,10 @@ void InferRequest::wait_notify() { } void InferRequest::wait() { - // TODO(Andrew): Enable below to support dynamic batch - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // wait_dynamic(); - // return; - // } + if (m_graph->GetMaxDynamicBatchSize() > 1) { + wait_dynamic(); + return; + } if (internal_outputs.empty()) { IE_THROW() << "Inference was not started!\n"; @@ -838,55 +834,55 @@ void InferRequest::wait() { m_graph->UpdatePerfStatistics(); } } -// TODO(Andrew): Enable below to support dynamic batch -// void InferRequest::preprocess_dynamic() { -// // execute input pre-processing. 
-//     execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
-// }
-// TODO(Andrew): Enable below to support dynamic batch
-// void InferRequest::enqueue_dynamic() {
-//     internal_outputs_dynamic.clear();
-//     auto numNets = m_graph->GetNetworksCount();
-//     internal_outputs_dynamic.resize(numNets);
-
-//     // set up exection and put all graphs into driver queue
-//     for (unsigned nb = 0; nb < numNets; nb++) {
-//         unsigned int mask = 1 << nb;
-
-//         if (m_curBatch & mask) {
-//             for (auto& item : _inputs) {
-//                 const cldnn::primitive_id& inputName = item.first;
-//                 const Blob::Ptr inputBlob = item.second;
-
-//                 auto inputLayout = m_graph->GetInputLayouts().at(inputName);
-//                 inputLayout.size.batch[0] = mask;
-//                 copy_input_data(m_graph->GetNetwork(nb), inputName, inputLayout, *inputBlob, &batchInputs[inputName][nb]);
-//             }
-//             internal_outputs_dynamic[nb] = m_graph->GetNetwork(nb)->execute();
-//         }
-//     }
-// }
-// TODO(Andrew): Enable below to support dynamic batch
-// void InferRequest::wait_dynamic() {
-//     if (internal_outputs_dynamic.empty()) {
-//         IE_THROW() << "Inference was not started!\n";
-//     }
-
-//     // now try to get execution results
-//     for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) {
-//         unsigned int mask = 1 << nb;
-
-//         if (m_curBatch & mask) {
-//             for (auto& no : _networkOutputs) {
-//                 std::string outputID = outputsMap.at(no.first);
-//                 auto outputMemory = internal_outputs_dynamic[nb].at(outputID).get_memory();
-//                 Blob::Ptr bptr = _outputs[no.first];
-
-//                 copy_output_data(outputMemory, bptr, &batchOutputs[no.first][nb]);
-//             }
-//         }
-//     }
-// }
+
+void InferRequest::preprocess_dynamic() {
+    // execute input pre-processing.
+    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+}
+
+void InferRequest::enqueue_dynamic() {
+    internal_outputs_dynamic.clear();
+    auto numNets = m_graph->GetNetworksCount();
+    internal_outputs_dynamic.resize(numNets);
+
+    // set up execution and put all graphs into driver queue
+    for (unsigned nb = 0; nb < numNets; nb++) {
+        unsigned int mask = 1 << nb;
+
+        if (m_curBatch & mask) {
+            for (auto& item : _inputs) {
+                const cldnn::primitive_id& inputName = item.first;
+                const Blob::Ptr inputBlob = item.second;
+
+                auto inputLayout = m_graph->GetInputLayouts().at(inputName);
+                inputLayout.size[0] = mask;  // batch idx of dims is 0
+                copy_input_data(m_graph->GetNetwork(nb), inputName, inputLayout, *inputBlob, &batchInputs[inputName][nb]);
+            }
+            internal_outputs_dynamic[nb] = m_graph->GetNetwork(nb)->execute();
+        }
+    }
+}
+
+void InferRequest::wait_dynamic() {
+    if (internal_outputs_dynamic.empty()) {
+        IE_THROW() << "Inference was not started!\n";
+    }
+
+    // now try to get execution results
+    for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) {
+        unsigned int mask = 1 << nb;
+
+        if (m_curBatch & mask) {
+            for (auto& no : _networkOutputs) {
+                std::string outputID = outputsMap.at(no.first);
+                auto outputMemory = internal_outputs_dynamic[nb].at(outputID).get_memory();
+                Blob::Ptr bptr = _outputs[no.first];
+
+                copy_output_data(outputMemory, bptr, &batchOutputs[no.first][nb]);
+            }
+        }
+    }
+}
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal utils --------- ----------------------------------- //
@@ -900,22 +896,21 @@ void InferRequest::setup_stream_graph() {
         streamID = streamID % numGraphs;
     }
     m_graph = streamGraphs[streamID];
-    // TODO(Andrew): Enable below to support 
dynamic batch // in case of dynamic batch, check all input blobs and set new batch - // if (m_graph->GetMaxDynamicBatchSize() > 1) { - // for (auto& input : _networkInputs) { - // auto node = findInputByNodeName(input.first); - // bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic()); - // if (!is_dynamic) - // continue; - // // extract new batch size from blob - // const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first; - // if (batch_idx >= 0) { - // SetBatch(_inputs[input.first]->getTensorDesc().getDims()[batch_idx]); - // break; - // } - // } - // } + if (m_graph->GetMaxDynamicBatchSize() > 1) { + for (auto& input : _networkInputs) { + auto node = findInputByNodeName(input.first); + bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic()); + if (!is_dynamic) + continue; + // extract new batch size from blob + const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first; + if (batch_idx >= 0) { + SetBatch(_inputs[input.first]->getTensorDesc().getDims()[batch_idx]); + break; + } + } + } } Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr alloc) { @@ -941,21 +936,21 @@ Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorD return blob; } -void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) { +void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::copy_output_data"); auto& stream = m_graph->GetNetwork()->get_stream(); switch (dst->getTensorDesc().getPrecision()) { - case Precision::FP64: copyResultToOutputBlob(src, dst, stream); break; - case Precision::FP32: copyResultToOutputBlob(src, dst, stream); break; - case Precision::FP16: copyResultToOutputBlob(src, dst, stream); break; - case Precision::I64: copyResultToOutputBlob(src, dst, stream); break; - case Precision::I32: copyResultToOutputBlob(src, dst, stream); break; - case Precision::I16: copyResultToOutputBlob(src, dst, stream); break; - case Precision::I8: copyResultToOutputBlob(src, dst, stream); break; - case Precision::U16: copyResultToOutputBlob(src, dst, stream); break; - case Precision::U32: copyResultToOutputBlob(src, dst, stream); break; - case Precision::U64: copyResultToOutputBlob(src, dst, stream); break; - case Precision::U8: copyResultToOutputBlob(src, dst, stream); break; + case Precision::FP64: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::FP32: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::FP16: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::I64: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::I32: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::I16: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::I8: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::U16: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::U32: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::U64: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::U8: copyResultToOutputBlob(src, dst, bi, stream); break; default: IE_THROW(NotImplemented) << "The plugin does not support output " << dst->getTensorDesc().getPrecision() << " precision"; } } @@ -963,44 +958,46 @@ void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) { void InferRequest::copy_input_data(std::shared_ptr 
network, const cldnn::primitive_id &inputName, const cldnn::layout& inputLayout, - const Blob &inputBlob) { + const Blob &inputBlob, buf_info* bi) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::copy_input_data"); + size_t offset = (bi == nullptr) ? 0 : bi->buf_offset; + cldnn::primitive_id internalName = "parameter:" + inputName; auto locked = inputBlob.cbuffer(); switch (inputBlob.getTensorDesc().getPrecision()) { case Precision::FP32: { - float* blob_ptr = const_cast(locked.as()); + float* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::I32: { - int32_t* blob_ptr = const_cast(locked.as()); + int32_t* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::I64: { - int64_t* blob_ptr = const_cast(locked.as()); + int64_t* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::FP16: { - uint16_t* blob_ptr = const_cast(locked.as()); + uint16_t* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::I8: { - int8_t* blob_ptr = const_cast(locked.as()); + int8_t* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::U8: { - uint8_t* blob_ptr = const_cast(locked.as()); + uint8_t* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::BOOL: { - uint8_t* blob_ptr = const_cast(locked.as()); + uint8_t* blob_ptr = const_cast(locked.as()) + offset; network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } @@ -1071,24 +1068,24 @@ void InferRequest::allocate_inputs() { } } } -// TODO(Andrew): Enable below to support dynamic batch -// void InferRequest::allocate_inputs_dynamic() { -// OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs_dynamic"); -// // allocate inputs -// for (auto &input : m_graph->GetNetworkInputs()) { -// InputInfo::Ptr ni = _networkInputs.at(input.first); -// TensorDesc desc = input.second->getTensorDesc(); - -// Blob::Ptr inputBlob = create_host_blob(desc); -// if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { -// desc.setPrecision(Precision::FP32); -// auto fp32inputBlob = InferenceEngine::make_shared_blob(desc); -// fp32inputBlob->allocate(); -// _inputs[input.first + fp32_suffix] = fp32inputBlob; -// } -// _inputs[input.first] = inputBlob; -// } -// } + +void InferRequest::allocate_inputs_dynamic() { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs_dynamic"); + // allocate inputs + for (auto &input : m_graph->GetNetworkInputs()) { + InputInfo::Ptr ni = _networkInputs.at(input.first); + TensorDesc desc = input.second->getTensorDesc(); + + Blob::Ptr inputBlob = create_host_blob(desc); + if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { + desc.setPrecision(Precision::FP32); + auto fp32inputBlob = InferenceEngine::make_shared_blob(desc); + fp32inputBlob->allocate(); + _inputs[input.first + fp32_suffix] 
= fp32inputBlob; + } + _inputs[input.first] = inputBlob; + } +} void InferRequest::allocate_outputs() { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs"); @@ -1136,20 +1133,20 @@ void InferRequest::allocate_outputs() { } } } -// TODO(Andrew): Enable below to support dynamic batch -// void InferRequest::allocate_outputs_dynamic() { -// OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs_dynamic"); -// // allocate outputs -// for (auto& no : m_graph->GetNetworkOutputs()) { -// std::string outputID = m_graph->MapOutputName(no.first); -// DataPtr oi = no.second; -// TensorDesc desc = oi->getTensorDesc(); - -// Blob::Ptr outputBlob = create_host_blob(desc); -// _outputs[no.first] = outputBlob; -// outputsMap[no.first] = outputID; -// } -// } + +void InferRequest::allocate_outputs_dynamic() { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs_dynamic"); + // allocate outputs + for (auto& no : m_graph->GetNetworkOutputs()) { + std::string outputID = m_graph->MapOutputName(no.first); + DataPtr oi = no.second; + TensorDesc desc = oi->getTensorDesc(); + + Blob::Ptr outputBlob = create_host_blob(desc); + _outputs[no.first] = outputBlob; + outputsMap[no.first] = outputID; + } +} void InferRequest::InferImpl() { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::InferImpl"); diff --git a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp index 249589ec644ec8..069b147381d79a 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp @@ -48,7 +48,7 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) { } template -void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info* bi, cldnn::stream& stream) { +void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info_legacy* bi, cldnn::stream& stream) { size_t n = (bi == nullptr) ? dst->size() : bi->buf_size; size_t offset = (bi == nullptr) ? 
0 : bi->buf_offset;
@@ -549,7 +549,7 @@ void InferRequestLegacy::SetBatch(int new_batch) {
             sz[batch_idx] = 1;
 
         size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies());
-        std::vector<buf_info> in_buf;
+        std::vector<buf_info_legacy> in_buf;
 
         size_t offset = 0;
         size_t bsz = single_batch;
@@ -558,7 +558,7 @@ void InferRequestLegacy::SetBatch(int new_batch) {
         for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) {
             unsigned int mask = 1 << nb;
 
-            buf_info ib = { offset, bsz };
+            buf_info_legacy ib = { offset, bsz };
             in_buf.push_back(ib);
 
             if (new_batch & mask)
@@ -576,7 +576,7 @@ void InferRequestLegacy::SetBatch(int new_batch) {
         if (batch_idx >= 0)
             sz[batch_idx] = 1;
         size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies());
-        std::vector<buf_info> out_buf;
+        std::vector<buf_info_legacy> out_buf;
 
         size_t offset = 0;
         size_t bsz = single_batch;
@@ -584,7 +584,7 @@ void InferRequestLegacy::SetBatch(int new_batch) {
         for (uint32_t nb = 0; nb < m_graph->GetNetworksCount(); nb++) {
             uint32_t mask = 1 << nb;
 
-            buf_info ob = { offset, bsz };
+            buf_info_legacy ob = { offset, bsz };
             out_buf.push_back(ob);
 
             if (new_batch & mask)
@@ -894,7 +894,7 @@ Blob::Ptr InferRequestLegacy::create_shared_device_blob(const InferenceEngine::T
     return blob;
 }
 
-void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) {
+void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info_legacy* bi) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequestLegacy::copy_output_data");
     auto& stream = m_graph->GetNetwork()->get_stream();
     switch (dst->getTensorDesc().getPrecision()) {
@@ -916,7 +916,7 @@ void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst,
 void InferRequestLegacy::copy_input_data(std::shared_ptr network,
                                    const cldnn::primitive_id &inputName,
                                    const cldnn::layout& inputLayout,
-                                   const Blob &inputBlob, buf_info* bi) {
+                                   const Blob &inputBlob, buf_info_legacy* bi) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequestLegacy::copy_input_data");
 
     size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
diff --git a/src/plugins/intel_gpu/src/plugin/program.cpp b/src/plugins/intel_gpu/src/plugin/program.cpp
index f2e37339537f0b..6ff9e179a93469 100644
--- a/src/plugins/intel_gpu/src/plugin/program.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program.cpp
@@ -156,6 +156,16 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptrget_output_partial_shape(0);
             batch_dim[pname].first = 0;
             batch_dim[pname].second = m_config.max_dynamic_batch;
+        } else {
+            dyn_shape_batch_found = IsDynBatchModel(func, shapes, batch_dim);
+            if (dyn_shape_batch_found) {
+                m_config.max_dynamic_batch = batch_dim.begin()->second.second;
+            } else {
+                if (!batch_dim.empty() && shapes.empty()) {
+                    // more than one dynamic dim or dynamic rank
+                    IE_THROW() << "Only dynamic batch is supported!";
+                }
+            }
+        }
 
     int m_bv_sz = GetMaxBatchSizeForSingleProgram();
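
Reviewer note (illustrative sketch, not part of the patch): SetBatch() splits a requested dynamic batch across the pre-built power-of-two sub-networks, and enqueue_dynamic()/wait_dynamic() only run the sub-networks whose bit is set in m_curBatch, slicing the user blob via the buf_info offsets computed in SetBatch(). The standalone C++ program below reproduces that offset/size arithmetic so the buffer layout is easy to verify; the concrete numbers (single_batch, networks_count, cur_batch) are made-up examples, not values taken from the patch.

#include <cstddef>
#include <cstdio>
#include <vector>

struct buf_info {
    size_t buf_offset;
    size_t buf_size;
};

int main() {
    const size_t single_batch = 3 * 224 * 224;  // element count of one batch item (example only)
    const unsigned networks_count = 4;          // sub-networks cover batch sizes 1, 2, 4, 8
    const int cur_batch = 5;                    // requested batch 5 = 0b101 -> sub-networks 0 and 2 execute

    // Same loop shape as SetBatch(): the chunk size doubles per sub-network, and the
    // offset advances only past chunks that will actually be executed for cur_batch.
    std::vector<buf_info> bufs;
    size_t offset = 0;
    size_t bsz = single_batch;
    for (unsigned nb = 0; nb < networks_count; ++nb) {
        bufs.push_back({offset, bsz});
        if (cur_batch & (1u << nb))
            offset += bsz;
        bsz <<= 1;
    }

    for (unsigned nb = 0; nb < networks_count; ++nb) {
        const bool runs = (cur_batch & (1u << nb)) != 0;
        std::printf("sub-network %u: offset=%zu elements, size=%zu elements%s\n",
                    nb, bufs[nb].buf_offset, bufs[nb].buf_size,
                    runs ? " (executed)" : " (skipped)");
    }
    return 0;
}

With these example values the program prints offsets 0 / 150528 / 150528 / 752640 and sizes 150528 / 301056 / 602112 / 1204224; only sub-networks 0 (batch 1) and 2 (batch 4) run, together covering exactly the 5 requested batch items, which mirrors how enqueue_dynamic() picks the chunk batchInputs[name][nb] for each executed sub-network.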