diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/async_infer_request_legacy.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/async_infer_request_legacy.hpp new file mode 100644 index 00000000000000..90731d9bed629f --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/async_infer_request_legacy.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include "intel_gpu/plugin/infer_request_legacy.hpp" + +namespace ov { +namespace runtime { +namespace intel_gpu { + +class AsyncInferRequestLegacy : public InferenceEngine::AsyncInferRequestThreadSafeDefault { +public: + using Parent = InferenceEngine::AsyncInferRequestThreadSafeDefault; + AsyncInferRequestLegacy(const InferRequestLegacy::Ptr &inferRequest, + const InferenceEngine::ITaskExecutor::Ptr& taskExecutor, + const InferenceEngine::ITaskExecutor::Ptr& waitExecutor, + const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor); + + ~AsyncInferRequestLegacy(); + + void Infer_ThreadUnsafe() override; + void StartAsync_ThreadUnsafe() override; + +private: + InferRequestLegacy::Ptr _inferRequest; + InferenceEngine::ITaskExecutor::Ptr _waitExecutor; +}; + +} // namespace intel_gpu +} // namespace runtime +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp index 77d3667262d376..bc0bf027ac2c6a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp @@ -78,7 +78,6 @@ class InferRequest : public InferenceEngine::IInferRequestInternal { bool m_useProfiling = false; bool m_useStreams = false; bool m_useExternalQueue = false; - bool is_allocated = false; std::shared_ptr m_graph; // dynamic batch stuff @@ -92,19 +91,19 @@ class InferRequest : public InferenceEngine::IInferRequestInternal { InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, std::shared_ptr alloc = nullptr); - InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout); + InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc); + void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr); void copy_input_data(std::shared_ptr network, const cldnn::primitive_id &inputName, const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob, buf_info* bi = nullptr); - InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem); void allocate_inputs(); void allocate_outputs(); + void allocate_inputs_dynamic(); + void allocate_outputs_dynamic(); - void set_input(const std::string& name, const InferenceEngine::Blob::Ptr& data); - void set_output(const std::string& name, const InferenceEngine::Blob::Ptr& data); InferenceEngine::Blob::Ptr reinterpret_device_blob(InferenceEngine::Blob::Ptr data, const InferenceEngine::TensorDesc& new_desc); std::map internal_outputs; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp index 50c282ea91f1f1..2ed40712b8e3e7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp +++ 
b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp @@ -80,8 +80,8 @@ class InferRequestLegacy : public InferenceEngine::IInferRequestInternal { std::shared_ptr m_graph; // dynamic batch stuff - std::map> batchInputs; - std::map> batchOutputs; + std::map> batchInputs; + std::map> batchOutputs; InferenceEngine::IStreamsExecutor* streamExecutor = nullptr; void prepare_input(const cldnn::primitive_id &inputName, InferenceEngine::Blob::Ptr &inputBlob, @@ -92,10 +92,10 @@ class InferRequestLegacy : public InferenceEngine::IInferRequestInternal { std::shared_ptr alloc = nullptr); InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout); - void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr); + void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info_legacy* bi = nullptr); void copy_input_data(std::shared_ptr network, const cldnn::primitive_id &inputName, const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob, - buf_info* bi = nullptr); + buf_info_legacy* bi = nullptr); InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem); void allocate_inputs(); diff --git a/src/plugins/intel_gpu/src/graph/eltwise.cpp b/src/plugins/intel_gpu/src/graph/eltwise.cpp index 6c5a7ba8372f44..5c36e059d462e8 100644 --- a/src/plugins/intel_gpu/src/graph/eltwise.cpp +++ b/src/plugins/intel_gpu/src/graph/eltwise.cpp @@ -31,23 +31,39 @@ layout eltwise_inst::calc_output_layout(eltwise_node const& node, kernel_impl_pa auto desc = impl_param.typed_desc(); auto output_type = desc->output_data_type ? *desc->output_data_type : input_node_layout.data_type; - ov::PartialShape out_pshape; - auto format = input_node_layout.format; - for (size_t i = 0; i < desc->input_size(); i++) { - if (i == primary_input_idx) - continue; + auto get_output_layout = [&](){ + auto format = input_node_layout.format; + if (input_node_layout.is_static()) { + auto size = input_node_layout.get_tensor(); + for (size_t i = 0; i < node.inputs_count(); i++) { + if (i == primary_input_idx) + continue; - auto l = impl_param.get_non_padded_input_layout(i); - if (!ov::PartialShape::broadcast_merge_into(out_pshape, l.size, ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY))) { - IE_THROW() << "incorrect input shapes\n"; + auto l = node.input(i).get_non_padded_output_layout(); + size = tensor::max(size, l.get_tensor()); + if (l.format == format::b_fs_zyx_fsv16) // use optimized 5D + format = format::b_fs_zyx_fsv16; + else if (l.format == format::bs_fs_zyx_bsv16_fsv16) + format = format::bs_fs_zyx_bsv16_fsv16; + } + return layout(output_type, format, size); + } else { + ov::PartialShape out_pshape; + for (size_t i = 0; i < node.inputs_count(); i++) { + auto l = node.input(i).get_non_padded_output_layout(); + if (!ov::PartialShape::broadcast_merge_into(out_pshape, l.size, ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY))) { + IE_THROW() << "incorrect input shapes\n"; + } + if (l.format == format::b_fs_zyx_fsv16) // use optimized 5D + format = format::b_fs_zyx_fsv16; + else if (l.format == format::bs_fs_zyx_bsv16_fsv16) + format = format::bs_fs_zyx_bsv16_fsv16; + } + return layout(output_type, format, out_pshape); } + }; - if (l.format == format::b_fs_zyx_fsv16) // use optimized 5D - format = format::b_fs_zyx_fsv16; - else if (l.format == 
format::bs_fs_zyx_bsv16_fsv16) - format = format::bs_fs_zyx_bsv16_fsv16; - } - auto output_layout = layout(output_type, format, out_pshape); + auto output_layout = get_output_layout(); auto mode = desc->mode; // list of operations supported for integer types diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp index d5ff619b204bfe..8e578ee557dfe2 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp @@ -11,6 +11,8 @@ #include "gemm/gemm_kernel_base.h" #include "intel_gpu/runtime/error_handler.hpp" +#include "matmul_shape_inference.hpp" + namespace cldnn { namespace ocl { @@ -29,8 +31,61 @@ struct gemm_impl : typed_primitive_impl_ocl { auto gemm_optional_params = get_default_optional_params(arg.get_program()); - for (size_t i = 1; i < arg.inputs_count(); i++) { - gemm_params.inputs.push_back(convert_data_tensor(impl_param->input_layouts[i])); + auto gemmSpecificPartialShape = [](ov::PartialShape& pshape) { + switch (pshape.rank().get_length()) { + case 2: { // batch, feature representation (rank == 2) + pshape.insert(pshape.begin(), 1ul); + pshape.insert(pshape.begin(), 1ul); + break; + } + case 3 : { // feature representation (rank == 3) + pshape.insert(pshape.begin(), 1, 1ul); + break; + } + } + }; + auto output_layout = arg.get_output_layout(); + auto output_pshape = output_layout.size; + auto output_rank = output_pshape.rank().get_length(); + std::vector input_shapes; + for (size_t i = 0; i < arg.inputs_count(); i++) { + auto input_layout = arg.input(i).get_output_layout(); + auto input_pshape = input_layout.get_partial_shape(); + auto input_rank = input_pshape.rank().get_length(); + if (input_rank != output_rank || input_rank < 4) { + if (input_rank == 1) { + bool transpose = false; + if (i == 0) { + transpose = arg.get_primitive()->transpose_input0; + input_pshape.insert(input_pshape.begin(), 1); + } else { + transpose = arg.get_primitive()->transpose_input1; + input_pshape.insert(input_pshape.end(), 1); + } + if (transpose) { + std::swap(input_pshape[0], input_pshape[1]); + } + } + if (input_rank < output_rank) + input_pshape.insert(input_pshape.begin(), output_rank - input_rank, 1ul); + + gemmSpecificPartialShape(input_pshape); + } + input_layout.size = input_pshape; + input_shapes.push_back(input_pshape); + if (i == 0) + gemm_params.inputs[0] = convert_data_tensor(input_layout); + else + gemm_params.inputs.push_back(convert_data_tensor(input_layout)); + } + if (output_rank < 4) { + ov::op::v0::MatMul op; + op.set_transpose_a(arg.get_primitive()->transpose_input0); + op.set_transpose_b(arg.get_primitive()->transpose_input1); + std::vector output_shapes = {ov::PartialShape()}; + shape_infer(&op, input_shapes, output_shapes); + output_layout.size = output_shapes[0]; + gemm_params.outputs[0] = convert_data_tensor(output_layout); } gemm_params.alpha = desc->alpha; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/strided_slice.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/strided_slice.cpp index 299234e52a7c29..fa9bdf3da5a8f7 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/strided_slice.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/strided_slice.cpp @@ -32,13 +32,47 @@ struct strided_slice_impl : typed_primitive_impl_ocl { auto op_params = get_default_optional_params(arg.get_program()); const size_t dims_num = params.inputs[0].Dimentions(); - // Getting data from constant inputs. 
There are 3 args: Begin, End, Stride - for (size_t i = 1; i < arg.get_dependencies().size(); ++i) { - auto& input = arg.get_dependency(i).as(); - auto mem = input.get_attached_memory_ptr(); - std::vector sizes = read_vector(mem, arg.get_program().get_stream()); - pad_vector_to_size(sizes, dims_num, i != 1); // for "begin" completion used 0 value, for other - 1 - params.striding_params.push_back(sizes); + if (!arg.const_mem.empty()) { + // Getting data from constant inputs. There are 3 args: Begin, End, Stride + for (size_t i = 0; i < arg.const_mem.size(); ++i) { + auto mem = arg.const_mem[i]; + std::vector sizes; + if (mem->get_layout().data_type == cldnn::data_types::i64) { + mem_lock lock{mem, arg.get_program().get_stream()}; + int64_t* data = lock.data(); + std::vector sizes_i64 = std::vector(data, data + mem->get_layout().count()); + sizes.resize(sizes_i64.size()); + for (size_t j = 0; j < sizes.size(); j++) + sizes[j] = static_cast(sizes_i64[j]); + } else { + mem_lock lock{mem, arg.get_program().get_stream()}; + int32_t* data = lock.data(); + sizes = std::vector(data, data + mem->get_layout().count()); + } + pad_vector_to_size(sizes, dims_num, i != 1); // for "begin" completion used 0 value, for other - 1 + params.striding_params.push_back(sizes); + } + } else { + // Getting data from constant inputs. There are 3 args: Begin, End, Stride + for (size_t i = 1; i < arg.get_dependencies().size(); ++i) { + auto& input = arg.get_dependency(i).as(); + auto mem = input.get_attached_memory_ptr(); + std::vector sizes; + if (input.get_output_layout().data_type == cldnn::data_types::i64) { + mem_lock lock{mem, arg.get_program().get_stream()}; + int64_t* data = lock.data(); + std::vector sizes_i64 = std::vector(data, data + input.get_output_layout().count()); + sizes.resize(sizes_i64.size()); + for (size_t j = 0; j < sizes.size(); j++) + sizes[j] = static_cast(sizes_i64[j]); + } else { + mem_lock lock{mem, arg.get_program().get_stream()}; + int32_t* data = lock.data(); + sizes = std::vector(data, data + input.get_output_layout().count()); + } + pad_vector_to_size(sizes, dims_num, i != 1); // for "begin" completion used 0 value, for other - 1 + params.striding_params.push_back(sizes); + } } auto begin_mask_ = prim->begin_mask; diff --git a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp index ba751bd632da35..71cceb90fddff5 100644 --- a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp @@ -720,7 +720,7 @@ kernel_selector::dev_type get_device_type(cldnn::device_type type) { kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split, const tensor view_offset) { const auto& pad = l.data_padding; - const auto& vals = l.get_dims(); + const auto& vals = l.get_tensor().sizes(l.format); const auto& add_offsets = view_offset.sizes(l.format); const auto& lower_pad = pad.lower_size().sizes(l.format); const auto& upper_pad = pad.upper_size().sizes(l.format); diff --git a/src/plugins/intel_gpu/src/graph/reshape.cpp b/src/plugins/intel_gpu/src/graph/reshape.cpp index dda6ebab29471e..ce0e7f9d59fd23 100644 --- a/src/plugins/intel_gpu/src/graph/reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/reshape.cpp @@ -127,10 +127,15 @@ reshape_inst::typed_primitive_inst(network& network, reshape_node const& node) : // if reshape operated in-place, postpone creation of the output until network run, // then create new memory object as the reinterpreted output of 
the previous primitive - if (!node.can_be_optimized()) - _output = allocate_output(); - else - reuse_input(); + if (_node.get_output_layout().is_static()) { + if (!node.can_be_optimized()) + _output = allocate_output(); + else + reuse_input(); + } else { + if (_exec_deps.size() > 0 && input_memory_ptr()) + reuse_input(); + } } static std::vector read_vector(cldnn::memory::ptr mem, cldnn::stream& stream) { diff --git a/src/plugins/intel_gpu/src/plugin/async_infer_request_legacy.cpp b/src/plugins/intel_gpu/src/plugin/async_infer_request_legacy.cpp new file mode 100644 index 00000000000000..adf4b29307944e --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/async_infer_request_legacy.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/plugin/async_infer_request_legacy.hpp" +#include "intel_gpu/plugin/itt.hpp" +#include + +namespace ov { +namespace runtime { +namespace intel_gpu { + +AsyncInferRequestLegacy::AsyncInferRequestLegacy(const InferRequestLegacy::Ptr &inferRequest, + const InferenceEngine::ITaskExecutor::Ptr& taskExecutor, + const InferenceEngine::ITaskExecutor::Ptr& waitExecutor, + const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor) + : AsyncInferRequestThreadSafeDefault(inferRequest, taskExecutor, callbackExecutor), _inferRequest(inferRequest), _waitExecutor(waitExecutor) { + _pipeline = {}; + + if (!_inferRequest->use_external_queue()) { + _pipeline.push_back({taskExecutor, + [this] { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "AsyncInferRequestLegacy::PreprocessingAndStartPipeline"); + _inferRequest->setup_stream_graph(); + _inferRequest->preprocess(); + _inferRequest->enqueue(); + _inferRequest->wait(); + } }); + } else { + _pipeline.push_back({ _waitExecutor, + [this] { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "AsyncInferRequestLegacy::WaitPipeline"); + _inferRequest->wait_notify(); + } }); + } +} + +void AsyncInferRequestLegacy::Infer_ThreadUnsafe() { + if (_inferRequest->use_external_queue()) { + _inferRequest->setup_stream_graph(); + _inferRequest->preprocess_notify(); + _inferRequest->enqueue_notify(); + } + Parent::Infer_ThreadUnsafe(); +} + +void AsyncInferRequestLegacy::StartAsync_ThreadUnsafe() { + if (_inferRequest->use_external_queue()) { + _inferRequest->setup_stream_graph(); + _inferRequest->preprocess_notify(); + _inferRequest->enqueue_notify(); + } + Parent::StartAsync_ThreadUnsafe(); +} + +AsyncInferRequestLegacy::~AsyncInferRequestLegacy() { + StopAndWait(); +} + +} // namespace intel_gpu +} // namespace runtime +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index 1af9a68fd790e3..b76e50eef748ad 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -8,6 +8,7 @@ #include "intel_gpu/plugin/infer_request.hpp" #include "intel_gpu/plugin/compiled_model.hpp" #include "intel_gpu/plugin/async_infer_request.hpp" +#include "intel_gpu/plugin/async_infer_request_legacy.hpp" #include "openvino/runtime/intel_gpu/properties.hpp" #include "intel_gpu/plugin/infer_request_legacy.hpp" @@ -121,8 +122,14 @@ IInferRequestInternal::Ptr CompiledModel::CreateInferRequest() { if (this->_plugin && _plugin->IsNewAPI()) { internalRequest = CreateInferRequestImpl(_parameters, _results); } - if (!internalRequest) + if (!internalRequest) { internalRequest = CreateInferRequestImpl(_networkInputs, _networkOutputs); + 
internalRequest->setPointerToExecutableNetworkInternal(shared_from_this()); + return std::make_shared(std::static_pointer_cast(internalRequest), + m_taskExecutor, + m_waitExecutor, + _callbackExecutor); + } internalRequest->setPointerToExecutableNetworkInternal(shared_from_this()); return std::make_shared(std::static_pointer_cast(internalRequest), m_taskExecutor, diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp index afb537f7555666..88997966d6bae9 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp @@ -52,7 +52,7 @@ void convertAndCopy(const InferenceEngine::Blob* src, dst_t* dst) { } template -void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::intel_gpu::buf_info* bi, cldnn::stream& stream) { +void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info* bi, cldnn::stream& stream) { size_t n = (bi == nullptr) ? dst->size() : bi->buf_size; size_t offset = (bi == nullptr) ? 0 : bi->buf_offset; @@ -234,6 +234,8 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) { if (inputTensorsMap.find(name) != inputTensorsMap.end()) { inputTensorsMap.erase(name); } + const bool compoundBlobPassed = data->is(); + InputInfo::Ptr foundInput; DataPtr foundOutput; auto blobDesc = data->getTensorDesc(); @@ -252,66 +254,149 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) { size_t netReqBinSize = std::accumulate(desc.getDims().begin(), desc.getDims().end(), desc.getPrecision().size(), std::multiplies()); + bool preProcResize = false; auto node = is_input ? findInputByNodeName(name) : findOutputByNodeName(name); bool isDynamic = node && node->get_output_partial_shape(0).is_dynamic(); + if (is_input) { + preProcResize = foundInput->getPreProcess().getResizeAlgorithm() != ResizeAlgorithm::NO_RESIZE; + const auto inputColorFormat = foundInput->getPreProcess().getColorFormat(); + preProcResize |= (inputColorFormat != ColorFormat::RAW) && (inputColorFormat != ColorFormat::BGR); + } - if (!isDynamic && dataBinSize != netReqBinSize) { + if (!isDynamic && + dataBinSize != netReqBinSize && !compoundBlobPassed && !preProcResize) { IE_THROW() << "Incorrect binary data size for " << (is_input ? 
"input" : "output") << " blob with name: \'" << name << "\' " << "Current: " << dataBinSize << " Required: " << netReqBinSize; } - if (is_input) { - set_input(name, data); - } else { - set_output(name, data); - } -} - -void InferRequest::set_input(const std::string& name, const Blob::Ptr& data) { auto remote_ptr = data->as(); bool is_remote = remote_ptr != nullptr; - - auto node = findInputByNodeName(name); - bool isDynamic = node && node->get_output_partial_shape(0).is_dynamic(); - if (is_remote) { - _deviceInputs[name] = data; - _inputs[name] = data; - } else { - if (data->buffer() == nullptr) - IE_THROW(NotAllocated) << str_input_not_allocated << " Input name: \'" << name << "\'"; - _inputs[name] = data; - if (isDynamic) { - // We must create new input data if it has never been allocated or previously allocated - // device blob is smaller than currently assigned user blob - bool needs_realloc = _deviceInputs.find(name) == _deviceInputs.end() || _deviceInputs.at(name)->byteSize() < data->byteSize(); - if (needs_realloc) { - _deviceInputs[name] = create_device_blob(data->getTensorDesc()); - } else { - if (_deviceInputs.at(name)->getTensorDesc() != data->getTensorDesc()) - _deviceInputs[name] = reinterpret_device_blob(_deviceInputs[name], data->getTensorDesc()); - } + auto impl = getBlobImpl(remote_ptr); + if (!impl->is_allocated()) { + impl->allocate(); } } -} + if (is_input) { + if (is_remote) { + _deviceInputs[name] = data; + _inputs[name] = data; + } else { + auto nv12_ptr = data->as(); + auto batched_ptr = data->as(); + bool is_batched = batched_ptr != nullptr; + bool is_nv12 = nv12_ptr != nullptr; + int expected_batch = is_batched ? desc.getDims()[0] : 1; + if (ColorFormat::NV12 == foundInput->getPreProcess().getColorFormat() && + m_graph->getConfig().nv12_two_inputs) { + // try extracting Y and UV remote blobs from it + // and put them into appropriate network inputs + // that should then go into biplanar NV12 reorder + + if (is_nv12 || is_batched) { + int num_blobs = is_batched ? batched_ptr->size() : 1; + for (auto i = 0; i < expected_batch; i++) { + std::string y_name = name + "_Y" + std::to_string(i); + std::string uv_name = name + "_UV" + std::to_string(i); + if (is_batched) { + int idx = i < num_blobs ? i : num_blobs-1; + nv12_ptr = getNV12BlobOrException(batched_ptr, idx); + } -void InferRequest::set_output(const std::string& name, const Blob::Ptr& data) { - auto remote_ptr = data->as(); - bool is_remote = remote_ptr != nullptr; + auto y_ptr = nv12_ptr->y()->as(); + if (y_ptr) { + auto y_impl = getBlobImpl(y_ptr); + if (!y_impl->is_allocated()) { + y_impl->allocate(); + } + _deviceInputs[y_name] = nv12_ptr->y(); + is_remote = true; + } - auto node = findOutputByNodeName(name); - bool isDynamic = node && node->get_output_partial_shape(0).is_dynamic(); + auto uv_ptr = nv12_ptr->uv()->as(); + if (uv_ptr) { + auto uv_impl = getBlobImpl(uv_ptr); + if (!uv_impl->is_allocated()) { + uv_impl->allocate(); + } + _deviceInputs[uv_name] = nv12_ptr->uv(); + is_remote = true; + } + } + } + } + if (is_remote) + _inputs[name] = data; + } - if (is_remote) { - _deviceOutputs[name] = data; + if (!is_remote) { + if (preProcessingRequired(foundInput, data)) { + // Stores the given blob as ROI blob. 
It will be used to fill in network input + // during pre-processing + if (_inputs[name]->is()) { + Blob::Ptr inputHostBlob = create_host_blob(desc); + _inputs[name] = inputHostBlob; + } + _preProcData[name] = CreatePreprocDataHelper(); + _preProcData[name]->isApplicable(data, _inputs[name]); + _preProcData[name]->setRoiBlob(data); + } else { + if (compoundBlobPassed) { + IE_THROW(NotImplemented) << cannot_set_compound; + } + if (isDynamic) { + // extract new batch size from blob + if (m_graph->GetMaxDynamicBatchSize() > 1) { + const auto batch_idx = m_graph->GetInputDynBatchDims()[name].first; + if (batch_idx >= 0) + SetBatch(blobDesc.getDims()[batch_idx]); + } + // We must create new input data if it has never been allocated or previously allocated + // device blob is smaller than currently assigned user blob + bool needs_realloc = _deviceInputs.find(name) == _deviceInputs.end() || _deviceInputs.at(name)->byteSize() < data->byteSize(); + if (needs_realloc) { + _deviceInputs[name] = create_device_blob(data->getTensorDesc()); + } else { + if (_deviceInputs.at(name)->getTensorDesc() != data->getTensorDesc()) + _deviceInputs[name] = reinterpret_device_blob(_deviceInputs[name], data->getTensorDesc()); + } + } else { + size_t blobSize = desc.getLayout() != SCALAR + ? details::product(desc.getDims()) + : 1; + if (dataSize != blobSize) { + IE_THROW() << "Input blob size is not equal to network input size (" + << dataSize << "!=" << blobSize << ")."; + } + } + if (data->buffer() == nullptr) + IE_THROW(NotAllocated) << str_input_not_allocated << " Input name: \'" << name << "\'"; + _inputs[name] = data; + } + } } else { - if (!isDynamic) { - if (data->buffer() == nullptr) - IE_THROW(NotAllocated) << str_output_not_allocated << " Output name: \'" << name << "\'"; + if (compoundBlobPassed) { + IE_THROW(NotImplemented) << cannot_set_compound; } + + if (is_remote) { + _deviceOutputs[name] = data; + } else { + if (!isDynamic) { + size_t outputSize = desc.getLayout() != SCALAR + ? 
details::product(desc.getDims()) + : 1; + if (dataSize != outputSize) { + IE_THROW() << "Output blob size is not equal to network output size (" << dataSize + << "!=" << outputSize << ")."; + } + if (data->buffer() == nullptr) + IE_THROW(NotAllocated) << str_output_not_allocated << " Output name: \'" << name << "\'"; + } + } + _outputs[name] = data; } - _outputs[name] = data; } void InferRequest::SetBlobs(const std::string& name, const std::vector& blobs) { @@ -371,7 +456,13 @@ void InferRequest::SetBlobs(const std::string& name, const std::vector()); - if (dataBinSize != netReqBinSize) { + bool preProcResize = false; + if (is_input) { + preProcResize = foundInput->getPreProcess().getResizeAlgorithm() != ResizeAlgorithm::NO_RESIZE; + const auto inputColorFormat = foundInput->getPreProcess().getColorFormat(); + preProcResize |= (inputColorFormat != ColorFormat::RAW) && (inputColorFormat != ColorFormat::BGR); + } + if (dataBinSize != netReqBinSize && !preProcResize) { IE_THROW() << "Incorrect binary data size for input blobs with name: \'" << name << "\' " << "Current: " << dataBinSize << " Required: " << netReqBinSize; } @@ -390,6 +481,14 @@ void InferRequest::SetBlobs(const std::string& name, const std::vectoras()); + if (!impl->is_allocated()) { + impl->allocate(); + } + } + } inputTensorsMap[name] = blobs; } @@ -539,10 +638,20 @@ InferRequest::InferRequest(const std::vector>& i // ----------------------------------------------------------------------------------------- // void InferRequest::preprocess_notify() { m_graph->wait(Graph::Stage::PREPROC); + if (m_graph->GetMaxDynamicBatchSize() > 1) { + preprocess_dynamic(); + } else { + execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP + } m_graph->notify(Graph::Stage::PREPROC); } void InferRequest::preprocess() { + if (m_graph->GetMaxDynamicBatchSize() > 1) { + preprocess_dynamic(); + } else { + execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP + } } void InferRequest::enqueue_notify() { @@ -551,6 +660,11 @@ void InferRequest::enqueue_notify() { } void InferRequest::enqueue() { + if (m_graph->GetMaxDynamicBatchSize() > 1) { + enqueue_dynamic(); + return; + } + // set input and output memory from request blob maps // into the network object primitives std::vector dependencies; @@ -670,6 +784,11 @@ void InferRequest::wait_notify() { } void InferRequest::wait() { + if (m_graph->GetMaxDynamicBatchSize() > 1) { + wait_dynamic(); + return; + } + if (internal_outputs.empty()) { IE_THROW() << "Inference was not started!\n"; } @@ -805,6 +924,21 @@ void InferRequest::setup_stream_graph() { streamID = streamID % numGraphs; } m_graph = streamGraphs[streamID]; + // in case of dynamic batch, check all input blobs and set new batch + if (m_graph->GetMaxDynamicBatchSize() > 1) { + for (auto& input : _networkInputs) { + auto node = findInputByNodeName(input.first); + bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic()); + if (!is_dynamic) + continue; + // extract new batch size from blob + const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first; + if (batch_idx >= 0) { + SetBatch(_inputs[input.first]->getTensorDesc().getDims()[batch_idx]); + break; + } + } + } } Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr alloc) { @@ -844,7 +978,7 @@ void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_i case Precision::I8: copyResultToOutputBlob(src, dst, bi, stream); break; 
case Precision::U16: copyResultToOutputBlob(src, dst, bi, stream); break; case Precision::U32: copyResultToOutputBlob(src, dst, bi, stream); break; - case Precision::U64: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::U64: copyResultToOutputBlob(src, dst, bi, stream); break; case Precision::U8: copyResultToOutputBlob(src, dst, bi, stream); break; default: IE_THROW(NotImplemented) << "The plugin does not support output " << dst->getTensorDesc().getPrecision() << " precision"; } @@ -910,7 +1044,8 @@ void InferRequest::allocate_inputs() { std::string name = ni.first; const TensorDesc& desc = ni.second->getTensorDesc(); - bool is_nv12_input = false; + bool is_nv12_input = ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() && + m_graph->getConfig().nv12_two_inputs; auto parameter = std::find_if(_parameters.begin(), _parameters.end(), [&](const std::shared_ptr& node) { return node->get_friendly_name() == name; @@ -963,6 +1098,24 @@ void InferRequest::allocate_inputs() { } } +void InferRequest::allocate_inputs_dynamic() { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs_dynamic"); + // allocate inputs + for (auto &input : m_graph->GetNetworkInputs()) { + InputInfo::Ptr ni = _networkInputs.at(input.first); + TensorDesc desc = input.second->getTensorDesc(); + + Blob::Ptr inputBlob = create_host_blob(desc); + if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { + desc.setPrecision(Precision::FP32); + auto fp32inputBlob = InferenceEngine::make_shared_blob(desc); + fp32inputBlob->allocate(); + _inputs[input.first + fp32_suffix] = fp32inputBlob; + } + _inputs[input.first] = inputBlob; + } +} + void InferRequest::allocate_outputs() { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs"); // allocate outputs @@ -1010,6 +1163,20 @@ void InferRequest::allocate_outputs() { } } +void InferRequest::allocate_outputs_dynamic() { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs_dynamic"); + // allocate outputs + for (auto& no : m_graph->GetNetworkOutputs()) { + std::string outputID = m_graph->MapOutputName(no.first); + DataPtr oi = no.second; + TensorDesc desc = oi->getTensorDesc(); + + Blob::Ptr outputBlob = create_host_blob(desc); + _outputs[no.first] = outputBlob; + outputsMap[no.first] = outputID; + } +} + void InferRequest::InferImpl() { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::InferImpl"); setup_stream_graph(); @@ -1109,19 +1276,22 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::Ptr& outputBlob) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::prepare_output"); - Blob::Ptr reqBlob = _deviceOutputs.at(outputName); - cldnn::primitive_id internalName = outputsMap[outputName]; - auto _nw_ptr = m_graph->GetNetwork(); auto remote_ptr = outputBlob->as(); - auto output_blob_ptr = (reqBlob != outputBlob && remote_ptr != nullptr) - ? remote_ptr - : reqBlob->as(); - auto impl = getBlobImpl(output_blob_ptr); - if (!impl->is_allocated()) { - IE_THROW(NotAllocated) << str_output_not_allocated; + bool is_remote = remote_ptr != nullptr; + if (is_remote) { + Blob::Ptr reqBlob = _deviceOutputs.at(outputName); + cldnn::primitive_id internalName = outputsMap[outputName]; + auto _nw_ptr = m_graph->GetNetwork(); + auto output_blob_ptr = (reqBlob != outputBlob && remote_ptr != nullptr) + ? 
remote_ptr + : reqBlob->as(); + auto impl = getBlobImpl(output_blob_ptr); + if (!impl->is_allocated()) { + IE_THROW(NotAllocated) << str_output_not_allocated; + } + auto outputMem = impl->getMemory(); + _nw_ptr->set_output_memory(internalName, outputMem); } - auto outputMem = impl->getMemory(); - _nw_ptr->set_output_memory(internalName, outputMem); } InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc) { diff --git a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp index 249589ec644ec8..069b147381d79a 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp @@ -48,7 +48,7 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) { } template -void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info* bi, cldnn::stream& stream) { +void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info_legacy* bi, cldnn::stream& stream) { size_t n = (bi == nullptr) ? dst->size() : bi->buf_size; size_t offset = (bi == nullptr) ? 0 : bi->buf_offset; @@ -549,7 +549,7 @@ void InferRequestLegacy::SetBatch(int new_batch) { sz[batch_idx] = 1; size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies()); - std::vector in_buf; + std::vector in_buf; size_t offset = 0; size_t bsz = single_batch; @@ -558,7 +558,7 @@ void InferRequestLegacy::SetBatch(int new_batch) { for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) { unsigned int mask = 1 << nb; - buf_info ib = { offset, bsz }; + buf_info_legacy ib = { offset, bsz }; in_buf.push_back(ib); if (new_batch & mask) @@ -576,7 +576,7 @@ void InferRequestLegacy::SetBatch(int new_batch) { if (batch_idx >= 0) sz[batch_idx] = 1; size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies()); - std::vector out_buf; + std::vector out_buf; size_t offset = 0; size_t bsz = single_batch; @@ -584,7 +584,7 @@ void InferRequestLegacy::SetBatch(int new_batch) { for (uint32_t nb = 0; nb < m_graph->GetNetworksCount(); nb++) { uint32_t mask = 1 << nb; - buf_info ob = { offset, bsz }; + buf_info_legacy ob = { offset, bsz }; out_buf.push_back(ob); if (new_batch & mask) @@ -894,7 +894,7 @@ Blob::Ptr InferRequestLegacy::create_shared_device_blob(const InferenceEngine::T return blob; } -void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) { +void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info_legacy* bi) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequestLegacy::copy_output_data"); auto& stream = m_graph->GetNetwork()->get_stream(); switch (dst->getTensorDesc().getPrecision()) { @@ -916,7 +916,7 @@ void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, void InferRequestLegacy::copy_input_data(std::shared_ptr network, const cldnn::primitive_id &inputName, const cldnn::layout& inputLayout, - const Blob &inputBlob, buf_info* bi) { + const Blob &inputBlob, buf_info_legacy* bi) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequestLegacy::copy_input_data"); size_t offset = (bi == nullptr) ? 
 0 : bi->buf_offset;
diff --git a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
index e3300184a952fd..03646d38bcace0 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
@@ -154,7 +154,6 @@ static void CreateParameterOp(Program& p, const std::shared_ptr 1) {
         networkInputLayout.set_tensor({ 1, TensorValue(inputDims[3]), TensorValue(inputDims[2]), TensorValue(inputDims[1]) });
-        std::vector inputs;
         for (size_t i = 0; i < inputDims[0].get_length(); ++i) {
             std::string batched_name = inputName + "_" + std::to_string(i);
diff --git a/src/plugins/intel_gpu/src/plugin/program.cpp b/src/plugins/intel_gpu/src/plugin/program.cpp
index 0a5dc2ddf394a7..a93f8db15412ef 100644
--- a/src/plugins/intel_gpu/src/plugin/program.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program.cpp
@@ -155,6 +155,16 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptrget_output_partial_shape(0);
             batch_dim[pname].first = 0;
             batch_dim[pname].second = m_config.max_dynamic_batch;
+        } else {
+            dyn_shape_batch_found = IsDynBatchModel(func, shapes, batch_dim);
+            if (dyn_shape_batch_found) {
+                m_config.max_dynamic_batch = batch_dim.begin()->second.second;
+            } else {
+                if (!batch_dim.empty() && shapes.empty()) {
+                    // more than one dynamic dim or dynamic rank
+                    IE_THROW() << "Only dynamic batch is supported!";
+                }
+            }
         }
     int m_bv_sz = GetMaxBatchSizeForSingleProgram();
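
Note on the eltwise change above: for dynamic shapes the output layout is now built by NumPy-broadcast-merging every input partial shape via ov::PartialShape::broadcast_merge_into. The standalone sketch below imitates that merge for static dims only, using just the standard library; the Shape alias, the -1 convention for a dynamic dimension, and the broadcast_merge helper are illustrative assumptions, not plugin code.

// Minimal stand-in for NumPy-style broadcast merging of two shapes.
// -1 marks a dynamic dimension; the real code uses ov::PartialShape::broadcast_merge_into.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <utility>
#include <vector>

using Shape = std::vector<int64_t>;

Shape broadcast_merge(Shape a, Shape b) {
    // Right-align the shorter shape by padding leading 1s.
    if (a.size() < b.size()) std::swap(a, b);
    b.insert(b.begin(), a.size() - b.size(), int64_t{1});
    Shape out(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        if (a[i] == -1 || b[i] == -1) {
            out[i] = -1;                         // dynamic stays dynamic
        } else if (a[i] == b[i] || b[i] == 1) {
            out[i] = a[i];
        } else if (a[i] == 1) {
            out[i] = b[i];
        } else {
            throw std::runtime_error("incorrect input shapes");
        }
    }
    return out;
}

int main() {
    Shape merged = broadcast_merge({-1, 3, 224, 224}, {1, 3, 1, 1});
    for (int64_t d : merged) std::cout << d << ' ';  // prints: -1 3 224 224
    std::cout << '\n';
}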
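
Note on the gemm change above: each input shape is canonicalized before conversion to a kernel tensor, so that a rank-1 input becomes a row vector (first input) or column vector (second input), leading 1s are inserted up to the output rank, and rank-2/rank-3 shapes get extra leading batch dims. A minimal sketch of that padding logic with plain std::vector shapes follows; canonicalize_gemm_input is a made-up name for illustration, not the plugin API.

// Pads a gemm input shape the way the kernel path above expects: rank-1 inputs
// become row/column vectors, then leading unit dims are added.
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

using Shape = std::vector<size_t>;

Shape canonicalize_gemm_input(Shape shape, size_t input_idx, bool transposed, size_t output_rank) {
    if (shape.size() == 1) {
        // First input becomes a row vector, second input a column vector.
        if (input_idx == 0)
            shape.insert(shape.begin(), size_t{1});
        else
            shape.push_back(1);
        if (transposed)
            std::swap(shape[0], shape[1]);
    }
    if (shape.size() < output_rank)
        shape.insert(shape.begin(), output_rank - shape.size(), size_t{1});
    // Match the kernel's batch/feature representation for low ranks.
    if (shape.size() == 2)
        shape.insert(shape.begin(), 2, size_t{1});
    else if (shape.size() == 3)
        shape.insert(shape.begin(), size_t{1});
    return shape;
}

int main() {
    Shape s = canonicalize_gemm_input({64}, /*input_idx=*/1, /*transposed=*/false, /*output_rank=*/2);
    for (size_t d : s) std::cout << d << ' ';  // prints: 1 1 64 1
    std::cout << '\n';
}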
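
Note on the strided_slice change above: the Begin/End/Stride constants are now read either from arg.const_mem or from constant dependencies, converted from i64 to i32 when needed, and padded to the tensor rank (0 for Begin, 1 for End and Stride). The sketch below shows only that conversion-and-padding step over raw buffers; read_i32_params is a hypothetical helper used for illustration, not plugin API.

// Converts a Begin/End/Stride buffer (i32 or i64) to int32_t and pads it to the
// tensor rank: Begin is padded with 0, End/Stride with 1.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

void pad_vector_to_size(std::vector<int32_t>& vec, size_t size, int32_t value) {
    while (vec.size() < size)
        vec.push_back(value);
}

std::vector<int32_t> read_i32_params(const void* data, size_t count, bool is_i64,
                                     size_t dims_num, bool is_begin) {
    std::vector<int32_t> sizes(count);
    if (is_i64) {
        const int64_t* src = static_cast<const int64_t*>(data);
        for (size_t i = 0; i < count; ++i)
            sizes[i] = static_cast<int32_t>(src[i]);
    } else {
        const int32_t* src = static_cast<const int32_t*>(data);
        sizes.assign(src, src + count);
    }
    pad_vector_to_size(sizes, dims_num, is_begin ? 0 : 1);
    return sizes;
}

int main() {
    const int64_t begin[] = {0, 1};
    auto sizes = read_i32_params(begin, 2, /*is_i64=*/true, /*dims_num=*/4, /*is_begin=*/true);
    for (int32_t v : sizes) std::cout << v << ' ';  // prints: 0 1 0 0
    std::cout << '\n';
}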
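
Note on the dynamic-batch handling in setup_stream_graph() above: when the graph was compiled with max_dynamic_batch > 1, the batch actually executed is taken from the bound input blob's dims at the recorded batch axis. A small sketch of that selection follows, assuming plain std maps in place of the request's internal state; pick_dynamic_batch is an invented name for illustration.

// Picks the dynamic batch to run with: the size of the recorded batch axis of the
// first dynamic input that has one (mirrors the setup_stream_graph() logic above).
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int64_t pick_dynamic_batch(const std::map<std::string, std::vector<size_t>>& input_dims,
                           const std::map<std::string, int64_t>& batch_axis) {
    for (const auto& kv : input_dims) {
        auto it = batch_axis.find(kv.first);
        if (it == batch_axis.end() || it->second < 0)
            continue;                                   // input has no dynamic batch axis
        return static_cast<int64_t>(kv.second[it->second]);
    }
    return 1;                                           // fall back to batch 1
}

int main() {
    std::cout << pick_dynamic_batch({{"data", {8, 3, 224, 224}}}, {{"data", 0}}) << '\n';  // 8
}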