From 9086a07547a478e8ef1fbfd52002d7d775fdfe7d Mon Sep 17 00:00:00 2001 From: Lyamin-Roman Date: Tue, 12 Mar 2024 06:16:04 +0900 Subject: [PATCH] [GPU] Remove use of legacy names --- .../include/intel_gpu/plugin/graph.hpp | 10 +- .../intel_gpu/plugin/program_builder.hpp | 13 +- .../intel_gpu/plugin/sync_infer_request.hpp | 28 +- src/plugins/intel_gpu/src/graph/network.cpp | 2 - src/plugins/intel_gpu/src/plugin/graph.cpp | 46 ++-- .../intel_gpu/src/plugin/ops/parameter.cpp | 8 +- .../intel_gpu/src/plugin/ops/result.cpp | 16 +- src/plugins/intel_gpu/src/plugin/plugin.cpp | 17 +- .../intel_gpu/src/plugin/program_builder.cpp | 15 +- .../src/plugin/sync_infer_request.cpp | 240 +++++++++--------- 10 files changed, 215 insertions(+), 180 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/graph.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/graph.hpp index 28bea9b80ff331..c843282aade33d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/graph.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/graph.hpp @@ -50,10 +50,11 @@ class Graph final { cldnn::engine& get_engine() const { return m_context->get_engine(); } const ExecutionConfig& get_config() const { return m_config; } - const std::map& get_input_layouts() const { return m_input_layouts; } + const std::map& get_input_layouts() const { return m_input_layouts; } std::shared_ptr get_network() const; - std::string out_name_to_internal(std::string out_port_name) const; + std::vector input_port_index_to_internal(size_t input_port_index) const; + std::string out_port_index_to_internal(size_t out_port_index) const; void wait(Stage stage_mask) { std::unique_lock lock(m_infer_mutex); @@ -84,12 +85,13 @@ class Graph final { std::shared_ptr m_network; std::map primitiveIDs; - std::map> prevPrimitiveIDs; + std::map> inputPrimitiveIDs; + std::map prevPrimitiveIDs; std::map> perfMap; std::vector profilingIDs; - std::map m_input_layouts; + std::map m_input_layouts; void build(std::shared_ptr program); std::shared_ptr get_runtime_model(std::vector& pi, bool filter_const_primitives = true); diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp index e92feaa31f58a6..c0fb697e6a6c2d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp @@ -6,6 +6,7 @@ #include "openvino/core/node.hpp" #include "openvino/runtime/profiling_info.hpp" +#include "openvino/op/parameter.hpp" #include "intel_gpu/plugin/custom_layer.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -91,22 +92,27 @@ class ProgramBuilder final { static const cldnn::primitive_id m_postCustomLayerTag; std::map primitive_ids; - std::map> prevPrimitiveIDs; + std::map> inputPrimitiveIDs; + std::map prevPrimitiveIDs; std::map> perfMap; std::vector profiling_ids; - std::map inputLayouts; + std::map inputLayouts; using BlobCacheKey = std::tuple; std::map blobMemCache; std::shared_ptr get_compiled_program() const; std::shared_ptr get_topology() const { return m_topology; } - const std::map& get_input_layouts() const { return inputLayouts; } + const std::map& get_input_layouts() const { return inputLayouts; } cldnn::engine& get_engine() const { return m_engine; } const ExecutionConfig& get_config() const { return m_config; } + int64_t get_parameter_index(const std::shared_ptr& parameter) const; + int64_t get_result_index(const ov::Output& value) const; + int64_t get_result_index(const 
ov::Output& value) const; + bool is_op_supported(const std::shared_ptr& op); // Profiling utils @@ -143,6 +149,7 @@ class ProgramBuilder final { private: static factories_map_t factories_map; std::shared_ptr m_program; + std::shared_ptr m_model; ExecutionConfig m_config; cldnn::engine& m_engine; static std::mutex m_mutex; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp index eb3697e4f3e4ab..dc2a4dc9143b0a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp @@ -69,15 +69,16 @@ class SyncInferRequest : public ov::ISyncInferRequest { private: void check_tensors() const override; - std::unordered_map m_user_inputs; - std::unordered_map m_user_outputs; + std::unordered_map m_user_inputs; + std::unordered_map m_user_outputs; - std::unordered_map m_plugin_inputs; - std::unordered_map m_plugin_outputs; + std::unordered_map m_plugin_inputs; + std::unordered_map m_plugin_outputs; - std::unordered_map> m_input_ports_map; - std::unordered_map> m_output_ports_map; - std::unordered_map m_output_names_map; + std::unordered_map> m_input_ports_map; + std::unordered_map> m_output_ports_map; + + std::unordered_map m_output_names_map; std::map m_internal_outputs; VariablesMap m_variables; @@ -90,9 +91,12 @@ class SyncInferRequest : public ov::ISyncInferRequest { bool m_use_external_queue = false; void prepare_state(const std::string& name, const std::shared_ptr& variable); - std::vector prepare_input(const std::string& name, const ov::Output& port, const TensorWrapper& user_tensor_wrapper); - std::vector prepare_output(const std::string& name, const ov::Output& port, const TensorWrapper& user_tensor_wrapper); - std::vector prepare_batched_input(const std::string& name, + std::vector prepare_input(const std::string& internal_name, + size_t input_idx, + const ov::Output& port, + const TensorWrapper& user_tensor_wrapper); + std::vector prepare_output(size_t output_idx, const ov::Output& port, const TensorWrapper& user_tensor_wrapper); + std::vector prepare_batched_input(size_t input_idx, const ov::Output& port, const std::vector>& user_tensors); @@ -108,8 +112,8 @@ class SyncInferRequest : public ov::ISyncInferRequest { void allocate_inputs(); void allocate_outputs(); void allocate_states(); - void allocate_input(const ov::Output& port, const std::string& name); - void allocate_output(const ov::Output& port, const std::string& name); + void allocate_input(const ov::Output& port, size_t input_idx); + void allocate_output(const ov::Output& port, size_t output_idx); cldnn::event::ptr copy_output_data(cldnn::memory::ptr src, const ov::ITensor& dst) const; void init_mappings(); diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 6a18faaa978c3c..a2cb3eabe2fb33 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -647,8 +647,6 @@ void cldnn::network::check_names() { } std::shared_ptr cldnn::network::find_primitive(const primitive_id& id) const { - std::shared_ptr ret; - if (_primitives.find(id) != _primitives.end()) return _primitives.at(id); diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index 44e4f78e509f7e..b0f7011075b263 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -45,6 +45,7 
@@ Graph::Graph(std::shared_ptr model, const RemoteContextImpl::Ptr& con build(program_builder->get_compiled_program()); primitiveIDs = program_builder->primitive_ids; + inputPrimitiveIDs = program_builder->inputPrimitiveIDs; prevPrimitiveIDs = program_builder->prevPrimitiveIDs; profilingIDs = program_builder->profiling_ids; perfMap = program_builder->perfMap; @@ -67,6 +68,7 @@ Graph::Graph(cldnn::BinaryInputBuffer &ib, const RemoteContextImpl::Ptr& context ib >> m_input_layouts; ib >> primitiveIDs; + ib >> inputPrimitiveIDs; ib >> prevPrimitiveIDs; ib >> profilingIDs; { @@ -104,6 +106,7 @@ Graph::Graph(std::shared_ptr graph, uint16_t stream_id) , m_config(graph->m_config) , m_stream_id(stream_id) , primitiveIDs(graph->primitiveIDs) + , inputPrimitiveIDs(graph->inputPrimitiveIDs) , prevPrimitiveIDs(graph->prevPrimitiveIDs) , perfMap(graph->perfMap) , profilingIDs(graph->profilingIDs) @@ -444,6 +447,7 @@ void Graph::export_model(cldnn::BinaryOutputBuffer &ob) { ob << m_input_layouts; ob << primitiveIDs; + ob << inputPrimitiveIDs; ob << prevPrimitiveIDs; ob << profilingIDs; { @@ -739,29 +743,35 @@ std::shared_ptr Graph::get_network() const { return m_network; } -std::string Graph::out_name_to_internal(std::string out_port_name) const { - auto networkOutputsIDs = get_network()->get_output_ids(); - auto allPrimitiveIds = get_network()->get_all_primitives(); +std::vector Graph::input_port_index_to_internal(size_t input_port_index) const { + OPENVINO_ASSERT(inputPrimitiveIDs.count(input_port_index) != 0 && !inputPrimitiveIDs.at(input_port_index).empty(), + "[GPU] Internal name of input primitive not found at index ", input_port_index); + return inputPrimitiveIDs.at(input_port_index); +} + +std::string Graph::out_port_index_to_internal(size_t out_port_index) const { + const auto& networkOutputsIDs = get_network()->get_output_ids(); + auto check_output = [&networkOutputsIDs](const cldnn::primitive_id& id) { + return std::find(networkOutputsIDs.begin(), networkOutputsIDs.end(), id) != networkOutputsIDs.end(); + }; + + OPENVINO_ASSERT(prevPrimitiveIDs.count(out_port_index) != 0, + "[GPU] Internal name of output primitive not found for index ", out_port_index); + cldnn::primitive_id outputID = prevPrimitiveIDs.at(out_port_index); - // Find correct output ID. Start with name stored in IR. - if (primitiveIDs.find(out_port_name) == primitiveIDs.end()) { - OPENVINO_THROW("output with name ", out_port_name, " was not found in primitiveIDs"); + if (check_output(outputID)) { + return outputID; } - std::string outputID = primitiveIDs.at(out_port_name); - while (std::find(networkOutputsIDs.begin(), networkOutputsIDs.end(), outputID) == networkOutputsIDs.end()) { - // If current ID isn't found in cldnn network outputs, get previous primitive id and try again. 
- auto prim = allPrimitiveIds.find(outputID); - if (prim == allPrimitiveIds.end()) { - OPENVINO_THROW("Unknown primitive id ", outputID); - } - if (prevPrimitiveIDs.at(outputID).size() != 1 || prim->second != "_optimized_") { - OPENVINO_THROW("Unable to find parent for output primitive ", outputID); - } - outputID = prevPrimitiveIDs.at(outputID)[0]; + OPENVINO_ASSERT(primitiveIDs.find(outputID) != primitiveIDs.end(), + "[GPU] Output with name ", outputID, " was not found in primitiveIDs"); + outputID = primitiveIDs.at(outputID); + + if (check_output(outputID)) { + return outputID; } - return outputID; + OPENVINO_THROW("[GPU] Unable to map output port index ", out_port_index, " to the internal primitive id"); } } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp index 63e0decb32fac8..ed2963b8f4b6aa 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp @@ -34,6 +34,8 @@ static void CreateParameterOp(ProgramBuilder& p, const std::shared_ptr &node) { return ov::is_type(node) || @@ -83,6 +85,7 @@ static void CreateParameterOp(ProgramBuilder& p, const std::shared_ptr surfaces_inputs; @@ -90,8 +93,8 @@ static void CreateParameterOp(ProgramBuilder& p, const std::shared_ptr 1) suffix = "_" + std::to_string(i); std::string batched_name = input_name + suffix; - p.inputLayouts.insert({ op->get_friendly_name() + suffix, input_layout }); p.add_primitive(*op, cldnn::input_layout(batched_name, input_layout)); + p.inputPrimitiveIDs[port_index].emplace_back(batched_name); auto reorder_layout = input_layout; reorder_layout.format = cldnn::format::bfyx; @@ -111,9 +114,10 @@ static void CreateParameterOp(ProgramBuilder& p, const std::shared_ptrget_friendly_name(), input_layout }); p.add_primitive(*op, cldnn::input_layout(input_name, input_layout)); + p.inputPrimitiveIDs[port_index] = { input_name }; + p.inputLayouts.insert({ port_index, input_layout }); if (connected_to_quantize(op)) { // Techically this reorder is not needed, but for some reason it impacts layout propagation logic diff --git a/src/plugins/intel_gpu/src/plugin/ops/result.cpp b/src/plugins/intel_gpu/src/plugin/ops/result.cpp index e0368c0b881c1c..a3206c2b879643 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/result.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/result.cpp @@ -20,12 +20,9 @@ static void CreateResultOp(ProgramBuilder& p, const std::shared_ptrget_input_node_shared_ptr(0); - auto input_id = ov::op::util::get_ie_output_name(op->get_input_source_output(0)); - if (input_id.empty()) { - input_id = prev->get_friendly_name(); - if (prev->get_output_size() > 1) { - input_id += "." + std::to_string(op->get_input_source_output(0).get_index()); - } + auto input_id = prev->get_friendly_name(); + if (prev->get_output_size() > 1) { + input_id += "." 
+ std::to_string(op->get_input_source_output(0).get_index()); } auto inputs = p.GetInputInfo(op); @@ -39,8 +36,11 @@ static void CreateResultOp(ProgramBuilder& p, const std::shared_ptrget_friendly_name()}); - p.prevPrimitiveIDs[out_primitive_name] = {input_id}; + p.add_primitive(*op, reorder_primitive, { input_id, op->get_friendly_name() }); + + int64_t port_index = p.get_result_index(op); + OPENVINO_ASSERT(port_index != -1, "[GPU] Result port index for ", input_id, " not found"); + p.prevPrimitiveIDs[port_index] = input_id; } REGISTER_FACTORY_IMPL(v0, Result); diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index e931746d6d92f9..28a2e6d584f967 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -655,7 +655,7 @@ uint32_t Plugin::get_max_batch_size(const ov::AnyMap& options) const { auto cloned_model = model->clone(); try { - std::set> batched_inputs; + std::set> batched_inputs; auto tmp_model = cloned_model->clone(); ov::pass::Manager m; @@ -675,11 +675,10 @@ uint32_t Plugin::get_max_batch_size(const ov::AnyMap& options) const { if (shape.size()) { for (size_t s = 0; s < shape.size(); s++) { if (ov::DimensionTracker::get_label(shape[s])) { - // batched dim for the input - auto batched_input_id = ov::op::util::get_ie_output_name(params[input_id]->output(0)); - GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id + batched_inputs.insert(std::make_pair(input_id, s)); + GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] detected batched input " << input->get_friendly_name() + << " with index " << input_id << "[" << s << "]" << std::endl; - batched_inputs.insert(std::make_pair(batched_input_id, s)); } } } @@ -691,9 +690,11 @@ uint32_t Plugin::get_max_batch_size(const ov::AnyMap& options) const { } try { - std::map shapes; - for (auto& param : cloned_model->get_parameters()) { - shapes[ov::op::util::get_ie_output_name(param->output(0))] = param->get_output_partial_shape(0); + std::map shapes; + const auto& params = cloned_model->get_parameters(); + for (size_t input_id = 0; input_id < params.size(); input_id++) { + const auto& param = params[input_id]; + shapes[input_id] = param->get_output_partial_shape(0); } for (const auto& input : batched_inputs) shapes[input.first][input.second] = base_batch_size; diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 7cd8349d120d56..3511b97056f25b 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -58,7 +58,8 @@ ProgramBuilder::ProgramBuilder(std::shared_ptr model, cldnn::engine& std::shared_ptr task_executor, std::shared_ptr compilation_context, bool is_inner_program) - : m_config(config) + : m_model(model) + , m_config(config) , m_engine(engine) , queryMode(false) , m_task_executor(task_executor) @@ -356,6 +357,18 @@ bool ProgramBuilder::requires_new_shape_infer(const std::shared_ptr& o return false; } +int64_t ProgramBuilder::get_parameter_index(const std::shared_ptr& parameter) const { + return m_model->get_parameter_index(parameter); +} + +int64_t ProgramBuilder::get_result_index(const ov::Output& value) const { + return m_model->get_result_index(value); +} + +int64_t ProgramBuilder::get_result_index(const ov::Output& value) const { + return m_model->get_result_index(value); +} + // TODO: Does it make sense to add such method to ov core? 
bool IsNodeOnConstPath(const std::shared_ptr& node) { std::set> nodes_processed = {}; diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 8b21669f2d8466..b42d0e59e6beb9 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -46,20 +46,6 @@ inline bool can_use_usm_host(const cldnn::engine& engine) { return can_use_usm; } -inline std::string get_port_name(const ov::Output& port) { - std::string name = {}; - if (name.empty()) { - bool is_input = ov::op::util::is_parameter(port.get_node()); - if (is_input) { - name = ov::op::util::get_ie_output_name(port); - } else { - const auto node = port.get_node_shared_ptr(); - name = ov::op::util::get_ie_output_name(node->input_value(0)); - } - } - return name; -} - bool is_convert_required(ov::element::Type src_et, ov::element::Type dst_et) { return src_et != dst_et && !(dst_et == ov::element::boolean && src_et == ov::element::u8); } @@ -150,48 +136,48 @@ std::vector> SyncInferRequest::query_state() const void SyncInferRequest::set_tensor(const ov::Output& port, const ov::SoPtr& tensor) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::set_tensor"); - const auto name = get_port_name(port); + const auto& port_info = find_port(port); + size_t port_index = port_info.idx; const auto& shape = port.get_partial_shape(); - OPENVINO_ASSERT(tensor != nullptr, "[GPU] Failed to set empty tensor to port: \'", name, "\'"); + OPENVINO_ASSERT(tensor != nullptr, "[GPU] Failed to set empty tensor to port with index: \'", port_index, "\'"); OPENVINO_ASSERT(port.get_element_type() == tensor->get_element_type(), "[GPU] Mismtach tensor and port type: ", port.get_element_type(), " vs ", tensor->get_element_type()); OPENVINO_ASSERT(shape.compatible(ov::PartialShape(tensor->get_shape())) || tensor->get_shape() == ov::Shape {0} || port.get_partial_shape().is_dynamic(), - "[GPU] The tensor size is not equal to model, can't set input tensor with name: ", - name, + "[GPU] The tensor size is not equal to model, can't set input tensor with index: ", + port_index, ", because model input (shape=", shape, ") and tensor (shape=", tensor->get_shape(), ") are incompatible"); - bool is_input = ov::op::util::is_parameter(port.get_node()); - - auto update_tensors_maps = [](const std::string& name, - std::unordered_map& user_tensors, - std::unordered_map& plugin_tensors, + auto update_tensors_maps = [](size_t port_index, + std::unordered_map& user_tensors, + std::unordered_map& plugin_tensors, const ov::SoPtr& tensor) { - auto current_tensor_owner = user_tensors[name].owner; - auto is_same_tensor = user_tensors[name].ptr == tensor._ptr; + auto current_tensor_owner = user_tensors[port_index].owner; + auto is_same_tensor = user_tensors[port_index].ptr == tensor._ptr; // Keep PLUGIN as a tensor owner if current user's tensor owner is PLUGIN and underlying tensor pointer is not changed auto new_tensor_owner = current_tensor_owner == TensorOwner::PLUGIN && is_same_tensor ? 
TensorOwner::PLUGIN : TensorOwner::USER; - user_tensors[name] = { tensor._ptr, new_tensor_owner }; + user_tensors[port_index] = { tensor._ptr, new_tensor_owner }; // We need to properly handle PLUGIN -> USER ownership change to prevent invalid PLUGIN's ush_host buffer sharing, // so remove plugin's tensor to reallocate it in prepare_input() mehtod if (current_tensor_owner == TensorOwner::PLUGIN && new_tensor_owner == TensorOwner::USER) { - if (plugin_tensors.count(name) && std::dynamic_pointer_cast(plugin_tensors[name].ptr)->is_shared()) - plugin_tensors.erase(plugin_tensors.find(name)); + if (plugin_tensors.count(port_index) && std::dynamic_pointer_cast(plugin_tensors[port_index].ptr)->is_shared()) + plugin_tensors.erase(plugin_tensors.find(port_index)); } }; + bool is_input = port_info.type == ov::ISyncInferRequest::FoundPort::Type::INPUT; if (is_input) { - update_tensors_maps(name, m_user_inputs, m_plugin_inputs, tensor); + update_tensors_maps(port_index, m_user_inputs, m_plugin_inputs, tensor); } else { - update_tensors_maps(name, m_user_outputs, m_plugin_outputs, tensor); + update_tensors_maps(port_index, m_user_outputs, m_plugin_outputs, tensor); } ov::ISyncInferRequest::set_tensor(port, tensor); @@ -201,7 +187,8 @@ void SyncInferRequest::set_tensors_impl(const ov::Output port, c if (tensors.size() == 1) { return set_tensor(port, tensors[0]); } - bool is_input = ov::op::util::is_parameter(port.get_node()); + const auto& port_info = find_port(port); + bool is_input = port_info.type == ov::ISyncInferRequest::FoundPort::Type::INPUT; OPENVINO_ASSERT(is_input, "[GPU] set_tensors_impl is not supported for output port"); bool is_remote = all_remote_buffers(tensors) || all_remote_surfaces(tensors); @@ -209,24 +196,22 @@ void SyncInferRequest::set_tensors_impl(const ov::Output port, c OPENVINO_ASSERT(is_host || is_remote, "[GPU] Incorrect input blobs. 
All blobs must be of the same type"); - for (const auto& input : get_inputs()) { - if (input == port) { - m_batched_tensors[input.get_tensor_ptr()] = tensors; - return; - } - } - OPENVINO_THROW("[GPU] Cannot find input tensors for port ", port); + size_t port_index = port_info.idx; + OPENVINO_ASSERT(m_input_ports_map.count(port_index) != 0, "[GPU] Cannot find input tensors for port ", port, " with index ", port_index); + const auto& tensor = m_input_ports_map.at(port_index).get_tensor_ptr(); + m_batched_tensors[tensor] = tensors; } ov::SoPtr SyncInferRequest::get_tensor(const ov::Output& port) const { - bool is_input = ov::op::util::is_parameter(port.get_node()); - const auto name = get_port_name(port); + const auto& port_info = find_port(port); + bool is_input = port_info.type == ov::ISyncInferRequest::FoundPort::Type::INPUT; + size_t port_index = port_info.idx; if (is_input) { - OPENVINO_ASSERT(m_user_inputs.count(name) == 1, "[GPU] Input tensor with name ", name, " is not found"); - return { m_user_inputs.at(name).ptr, nullptr }; + OPENVINO_ASSERT(m_user_inputs.count(port_index) == 1, "[GPU] Input tensor with index ", port_index, " is not found"); + return { m_user_inputs.at(port_index).ptr, nullptr }; } else { - OPENVINO_ASSERT(m_user_outputs.count(name) == 1, "[GPU] Output tensor with name ", name, " is not found"); - return { m_user_outputs.at(name).ptr, nullptr }; + OPENVINO_ASSERT(m_user_outputs.count(port_index) == 1, "[GPU] Output tensor with index ", port_index, " is not found"); + return { m_user_outputs.at(port_index).ptr, nullptr }; } } @@ -265,23 +250,24 @@ void SyncInferRequest::enqueue() { std::vector dependencies; for (const auto& it : m_input_ports_map) { - const auto& name = it.first; + size_t port_idx = it.first; const auto& port = it.second; if (m_batched_tensors.count(port.get_tensor_ptr()) > 0) { - auto events = prepare_batched_input(name, port, m_batched_tensors.at(port.get_tensor_ptr())); + auto events = prepare_batched_input(port_idx, port, m_batched_tensors.at(port.get_tensor_ptr())); std::move(events.begin(), events.end(), std::back_inserter(dependencies)); } else { - auto events = prepare_input(name, port, m_user_inputs.at(name)); + cldnn::primitive_id internal_name = m_graph->input_port_index_to_internal(port_idx)[0]; + auto events = prepare_input(internal_name, port_idx, port, m_user_inputs.at(port_idx)); std::move(events.begin(), events.end(), std::back_inserter(dependencies)); } } for (const auto& it : m_output_ports_map) { - const auto& name = it.first; + size_t port_idx = it.first; const auto& port = it.second; - auto events = prepare_output(name, port, m_user_outputs.at(name)); + auto events = prepare_output(port_idx, port, m_user_outputs.at(port_idx)); std::move(events.begin(), events.end(), std::back_inserter(dependencies)); } @@ -320,9 +306,9 @@ void SyncInferRequest::wait() { std::vector copy_events; for (const auto& it : m_output_ports_map) { - const auto& name = it.first; + size_t port_idx = it.first; const auto& port = it.second; - cldnn::primitive_id internal_name = m_output_names_map.at(name); + cldnn::primitive_id internal_name = m_output_names_map.at(port_idx); auto output_memory = m_internal_outputs.at(internal_name).get_memory(do_sync_per_output); auto output_layout = m_internal_outputs.at(internal_name).get_layout(); @@ -330,20 +316,22 @@ void SyncInferRequest::wait() { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::wait::reinterpret_memory"); OPENVINO_ASSERT(!output_memory->get_layout().data_padding, "[GPU] 
Unexpected padding in output buffer"); output_memory = m_graph->get_engine().reinterpret_buffer(*output_memory, output_layout); - GPU_DEBUG_TRACE_DETAIL << name << " model output: " << output_memory->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << internal_name << " model output with index " << port_idx << ": " << output_memory->buffer_ptr() << std::endl; } - OPENVINO_ASSERT(m_user_outputs.count(name) > 0, "[GPU] Output ", name, " is not found in output tensors map"); - auto output_tensor_wrapper = m_user_outputs.at(name); + OPENVINO_ASSERT(m_user_outputs.count(port_idx) > 0, "[GPU] Output index ", port_idx, " is not found in output tensors map"); + auto output_tensor_wrapper = m_user_outputs.at(port_idx); auto output_tensor = output_tensor_wrapper.ptr; auto remote_ptr = std::dynamic_pointer_cast(output_tensor); bool is_remote = remote_ptr != nullptr; bool is_dynamic = port.get_partial_shape().is_dynamic(); if (is_remote) { - GPU_DEBUG_TRACE_DETAIL << name << " handle output tensor (remote): " << remote_ptr->get_original_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << internal_name << " handle output tensor (remote) with index: " << port_idx << ": " + << remote_ptr->get_original_memory()->buffer_ptr() << std::endl; } else { - GPU_DEBUG_TRACE_DETAIL << name << " handle output tensor (host): " << output_tensor->data() << std::endl; + GPU_DEBUG_TRACE_DETAIL << internal_name << " handle output tensor (host) with index: " << port_idx << ": " + << output_tensor->data() << std::endl; } OPENVINO_ASSERT(output_tensor_wrapper.owner == TensorOwner::PLUGIN || is_dynamic || output_tensor_wrapper.actual_size >= output_memory->size(), @@ -370,7 +358,8 @@ void SyncInferRequest::wait() { need_reallocate = output_tensor_wrapper.actual_size < output_memory->size(); if (need_reallocate) { - auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), *m_shape_predictor); + std::string internal_name = m_output_names_map.at(port_idx); + auto actual_memory_shape = predict_shape(internal_name, mem_shape, output_tensor->get_element_type(), *m_shape_predictor); output_tensor->set_shape(actual_memory_shape); } } @@ -384,7 +373,7 @@ void SyncInferRequest::wait() { auto dst_ptr = static_cast(output_tensor->data()); bool same_mem = same_host_mem(output_memory, dst_ptr); if (!same_mem && output_memory->size()) { - GPU_DEBUG_TRACE_DETAIL << name << " copy from: " << output_memory->buffer_ptr() << " to " + GPU_DEBUG_TRACE_DETAIL << internal_name << " with index " << port_idx << " copy from: " << output_memory->buffer_ptr() << " to " << (!is_remote ? 
output_tensor->data() : remote_ptr->get_original_memory()->buffer_ptr()) << std::endl; if (auto ev = copy_output_data(output_memory, *output_tensor)) { copy_events.push_back(ev); @@ -515,29 +504,29 @@ cldnn::event::ptr SyncInferRequest::copy_output_data(cldnn::memory::ptr src, con } } -void SyncInferRequest::allocate_input(const ov::Output& port, const std::string& name) { +void SyncInferRequest::allocate_input(const ov::Output& port, size_t input_idx) { const auto& shape = port.get_partial_shape(); auto element_type = port.get_element_type(); - m_user_inputs[name] = { create_host_tensor(shape, element_type), TensorOwner::PLUGIN }; - ov::ISyncInferRequest::set_tensor(port, m_user_inputs.at(name).ptr); + m_user_inputs[input_idx] = { create_host_tensor(shape, element_type), TensorOwner::PLUGIN }; + ov::ISyncInferRequest::set_tensor(port, m_user_inputs.at(input_idx).ptr); } -void SyncInferRequest::allocate_output(const ov::Output& port, const std::string& name) { +void SyncInferRequest::allocate_output(const ov::Output& port, size_t output_idx) { const auto& shape = port.get_partial_shape(); auto element_type = port.get_element_type(); - m_user_outputs[name] = { create_host_tensor(shape, element_type), TensorOwner::PLUGIN }; - ov::ISyncInferRequest::set_tensor(port, m_user_outputs.at(name).ptr); + m_user_outputs[output_idx] = { create_host_tensor(shape, element_type), TensorOwner::PLUGIN }; + ov::ISyncInferRequest::set_tensor(port, m_user_outputs.at(output_idx).ptr); } void SyncInferRequest::allocate_inputs() { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::allocate_inputs"); for (const auto& it : m_input_ports_map) { - const auto& name = it.first; + size_t input_idx = it.first; const auto& port = it.second; - GPU_DEBUG_LOG << "[init " << name << " input blob]" << std::endl; + GPU_DEBUG_LOG << "[init input blob with index: " << input_idx << "]" << std::endl; bool is_nv12_input = false; if (port.get_rt_info().count(ov::preprocess::TensorInfoMemoryType::get_type_info_static())) { @@ -549,7 +538,7 @@ void SyncInferRequest::allocate_inputs() { } if (!is_nv12_input) { - allocate_input(port, name); + allocate_input(port, input_idx); } } } @@ -559,11 +548,11 @@ void SyncInferRequest::allocate_outputs() { // allocate outputs for (const auto& it : m_output_ports_map) { - const auto& name = it.first; + size_t output_idx = it.first; const auto& port = it.second; - GPU_DEBUG_LOG << "[init " << name << " output blob]" << std::endl; + GPU_DEBUG_LOG << "[init output blob with index: " << output_idx << "]" << std::endl; - allocate_output(port, name); + allocate_output(port, output_idx); } } @@ -596,12 +585,13 @@ void SyncInferRequest::prepare_state(const std::string& name, const std::shared_ m_graph->get_network()->set_variable(name, variable); } -std::vector SyncInferRequest::prepare_batched_input(const std::string& name, +std::vector SyncInferRequest::prepare_batched_input(size_t input_idx, const ov::Output& port, const std::vector>& user_tensors) { std::vector ret_events; bool is_host = all_host_tensors(user_tensors); bool is_remote_buffer = all_remote_buffers(user_tensors); + const cldnn::primitive::primitive_id_arr& internal_names = m_graph->input_port_index_to_internal(input_idx); // Host buffers are merged to single tensor if (is_host || is_remote_buffer) { auto tmp_shape = user_tensors.at(0)->get_shape(); @@ -627,12 +617,12 @@ std::vector SyncInferRequest::prepare_batched_input(const std } } - auto events = prepare_input(name, port, {merged_tensor, TensorOwner::PLUGIN}); + auto 
events = prepare_input(internal_names[0], input_idx, port, {merged_tensor, TensorOwner::PLUGIN}); std::move(events.begin(), events.end(), std::back_inserter(ret_events)); } else { + OPENVINO_ASSERT(user_tensors.size() == internal_names.size(), "[GPU] Internal names and user tensors size mismatch"); for (size_t i = 0; i < user_tensors.size(); i++) { - auto new_name = name + "_" + std::to_string(i); - auto events = prepare_input(new_name, port, {user_tensors[i]._ptr, TensorOwner::USER}); + auto events = prepare_input(internal_names[i], input_idx, port, {user_tensors[i]._ptr, TensorOwner::USER}); std::move(events.begin(), events.end(), std::back_inserter(ret_events)); } } @@ -640,10 +630,11 @@ std::vector SyncInferRequest::prepare_batched_input(const std return ret_events; } -std::vector SyncInferRequest::prepare_input(const std::string& name, +std::vector SyncInferRequest::prepare_input(const std::string& internal_name, + size_t input_idx, const ov::Output& port, const TensorWrapper& user_tensor_wrapper) { - OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, openvino::itt::handle("SyncInferRequest::prepare_input: " + name)); + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, openvino::itt::handle("SyncInferRequest::prepare_input: " + internal_name)); auto pshape = port.get_partial_shape(); auto is_dynamic = pshape.is_dynamic(); auto user_tensor = user_tensor_wrapper.ptr; @@ -654,7 +645,7 @@ std::vector SyncInferRequest::prepare_input(const std::string bool is_remote = remote_ptr != nullptr; bool is_usm_host_tensor = usm_host_ptr != nullptr; - GPU_DEBUG_TRACE_DETAIL << "Prepare input for " << name << " ( is_remote ? " << is_remote << ")" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "Prepare input for " << internal_name << " ( is_remote ? " << is_remote << ")" << std::endl; GPU_DEBUG_TRACE_DETAIL << " port shape : " << pshape.to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " user_tensor shape: " << user_tensor->get_shape().to_string() << std::endl; @@ -664,7 +655,7 @@ std::vector SyncInferRequest::prepare_input(const std::string OPENVINO_ASSERT(pshape.compatible(ov::PartialShape(user_tensor->get_shape())) || is_batched_input(port), "[GPU] The input tensor size is not equal to model port shape, can't handle input tensor with name: ", - name, + internal_name, ", because model input (shape=", pshape, ") and tensor (shape=", @@ -676,68 +667,70 @@ std::vector SyncInferRequest::prepare_input(const std::string if (is_remote) { if (convert_needed) { - m_plugin_inputs[name] = { create_device_tensor(pshape, - cldnn::element_type_to_data_type(element_type), - false), TensorOwner::PLUGIN }; + m_plugin_inputs[input_idx] = { create_device_tensor(pshape, + cldnn::element_type_to_data_type(element_type), + false), TensorOwner::PLUGIN }; } else { - m_plugin_inputs[name] = user_tensor_wrapper; + m_plugin_inputs[input_idx] = user_tensor_wrapper; } } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) { if (element_type != cldnn::element_type_to_data_type(element_type)) { - m_plugin_inputs[name] = {std::make_shared(m_context, - user_tensor->get_shape(), - cldnn::element_type_to_data_type(element_type), - TensorType::BT_USM_SHARED, - user_tensor->data()), TensorOwner::USER }; + m_plugin_inputs[input_idx] = { std::make_shared(m_context, + user_tensor->get_shape(), + cldnn::element_type_to_data_type(element_type), + TensorType::BT_USM_SHARED, + user_tensor->data()), TensorOwner::USER }; } else { - m_plugin_inputs[name] = {usm_host_ptr->get_impl(), user_tensor_wrapper.owner}; + 
m_plugin_inputs[input_idx] = { usm_host_ptr->get_impl(), user_tensor_wrapper.owner }; } is_remote = true; } auto user_tensor_mem_type = cldnn::allocation_type::unknown; - if (!is_remote) + if (!is_remote) { user_tensor_mem_type = engine.detect_usm_allocation_type(user_tensor_wrapper.ptr->data()); + } auto plugin_tensor_mem_type = cldnn::allocation_type::unknown; - if (m_plugin_inputs.count(name)) - plugin_tensor_mem_type = std::dynamic_pointer_cast(m_plugin_inputs[name].ptr)->get_original_memory()->get_allocation_type(); + if (m_plugin_inputs.count(input_idx)) { + plugin_tensor_mem_type = std::dynamic_pointer_cast(m_plugin_inputs[input_idx].ptr)->get_original_memory()->get_allocation_type(); + } // Note: currently, using USM Host memory for dGPUs in some scenarios (LLMs) leads to performance degradation, // so apply wider USM Host memory type detection only for iGPUs auto usm_host_raw_ptr = engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu && user_tensor_mem_type == cldnn::allocation_type::usm_host; - bool update_device_tensor = (m_plugin_inputs.count(name) == 0) || - (m_plugin_inputs[name].owner == TensorOwner::USER && !is_remote) || + bool update_device_tensor = (m_plugin_inputs.count(input_idx) == 0) || + (m_plugin_inputs[input_idx].owner == TensorOwner::USER && !is_remote) || (plugin_tensor_mem_type != cldnn::allocation_type::usm_host && usm_host_raw_ptr); if (update_device_tensor) { // If device input hasn't been created, then try to use user memory if it's usm_host, or allocate new device buffer - m_plugin_inputs[name] = create_or_share_device_tensor(user_tensor_wrapper, name, pshape, device_tensor_et, convert_needed); + m_plugin_inputs[input_idx] = create_or_share_device_tensor(user_tensor_wrapper, internal_name, pshape, device_tensor_et, convert_needed); } else if (!is_remote) { // Device memory has been created on previous iterations. 
Try to reuse whenever it's possible - auto device_tensor_wrapper = m_plugin_inputs.at(name); + auto device_tensor_wrapper = m_plugin_inputs.at(input_idx); auto device_tensor = std::dynamic_pointer_cast(device_tensor_wrapper.ptr); if (is_dynamic) { if (device_tensor->get_original_memory()->size() < user_tensor->get_byte_size()) { - auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, *m_shape_predictor); + auto actual_shape = predict_shape(internal_name, user_tensor->get_shape(), device_tensor_et, *m_shape_predictor); GPU_DEBUG_TRACE_DETAIL << " actual memory shape: " << actual_shape.to_string() << std::endl; auto new_tensor = create_device_tensor(actual_shape, device_tensor_et, false); new_tensor->set_shape(user_tensor->get_shape()); - m_plugin_inputs[name] = { new_tensor, TensorOwner::PLUGIN }; + m_plugin_inputs[input_idx] = { new_tensor, TensorOwner::PLUGIN }; } } } - auto device_tensor = std::dynamic_pointer_cast(m_plugin_inputs.at(name).ptr); + auto device_tensor = std::dynamic_pointer_cast(m_plugin_inputs.at(input_idx).ptr); if (is_dynamic) { OPENVINO_ASSERT(device_tensor->get_original_memory()->size() >= user_tensor->get_size(), "[GPU] Size of input device tensor (=", device_tensor->get_original_memory()->size(), ") is expected to be greater or equal to user tensor (=", user_tensor->get_size(), - ") in dynamic case for ", name); + ") in dynamic case for ", internal_name); // tensor reshape below is expected to work w/o reallocation device_tensor->set_shape(user_tensor->get_shape()); } else { @@ -746,7 +739,7 @@ std::vector SyncInferRequest::prepare_input(const std::string user_tensor->get_size(), ") and device tensor (=", device_tensor->get_size(), - ") don't match for ", name, + ") don't match for ", internal_name, ". Those are expected to be equal in case of static shape of the port"); } @@ -754,7 +747,7 @@ std::vector SyncInferRequest::prepare_input(const std::string // WA to extend shape to ranks expected by legacy shape infer. 
Remove after full migration to new shape infer if (!m_graph->get_config().get_property(ov::intel_gpu::allow_new_shape_infer)) { auto new_layout = memory->get_layout(); - new_layout.set_partial_shape(m_graph->get_input_layouts().at(name).get_shape()); + new_layout.set_partial_shape(m_graph->get_input_layouts().at(input_idx).get_shape()); memory = engine.reinterpret_buffer(*memory, new_layout); } @@ -773,8 +766,8 @@ std::vector SyncInferRequest::prepare_input(const std::string } } - GPU_DEBUG_TRACE_DETAIL << name << " prepare input: " << memory->buffer_ptr() << " alloc_type: " << memory->get_allocation_type() << std::endl; - const cldnn::primitive_id internal_name = "parameter:" + name; + GPU_DEBUG_TRACE_DETAIL << internal_name << " with index " << input_idx << " prepare input: " << memory->buffer_ptr() + << " alloc_type: " << memory->get_allocation_type() << std::endl; network->set_input_data(internal_name, memory); if (ret_event && !ret_event->is_set()) @@ -783,7 +776,7 @@ std::vector SyncInferRequest::prepare_input(const std::string return {}; } -std::vector SyncInferRequest::prepare_output(const std::string& name, +std::vector SyncInferRequest::prepare_output(size_t output_idx, const ov::Output& port, const TensorWrapper& user_tensor_wrapper) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::prepare_output"); @@ -792,16 +785,17 @@ std::vector SyncInferRequest::prepare_output(const std::strin auto element_type = port.get_element_type(); auto user_tensor = user_tensor_wrapper.ptr; auto remote_ptr = std::dynamic_pointer_cast(user_tensor); + auto internal_name = m_output_names_map.at(output_idx); bool is_remote = remote_ptr != nullptr; - GPU_DEBUG_TRACE_DETAIL << "Prepare output for " << name << std::endl; + GPU_DEBUG_TRACE_DETAIL << "Prepare output for " << internal_name << std::endl; GPU_DEBUG_TRACE_DETAIL << " port shape : " << pshape.to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " user_tensor shape: " << user_tensor->get_shape().to_string() << std::endl; if (user_tensor->get_size() > 0) { OPENVINO_ASSERT(pshape.compatible(ov::PartialShape(user_tensor->get_shape())), "[GPU] The output tensor size is not equal to model port shape, can't handle output tensor with name: ", - name, + internal_name, ", because model output (shape=", pshape, ") and tensor (shape=", @@ -812,40 +806,42 @@ std::vector SyncInferRequest::prepare_output(const std::strin auto network = m_graph->get_network(); auto device_tensor_et = convert_to_supported_device_type(element_type); bool convert_needed = is_convert_required(device_tensor_et, element_type); - cldnn::primitive_id internal_name = m_output_names_map.at(name); + if (is_remote && !convert_needed && !is_dynamic) { - m_plugin_outputs[name] = user_tensor_wrapper; + m_plugin_outputs[output_idx] = user_tensor_wrapper; } if (!is_dynamic) { auto is_cpu_impl = network->is_cpu_impl(internal_name); - bool has_device_buffer = m_plugin_outputs.count(name) > 0; + bool has_device_buffer = m_plugin_outputs.count(output_idx) > 0; bool update_device_tensor = !has_device_buffer || - (m_plugin_outputs[name].owner == TensorOwner::USER && !is_remote); + (m_plugin_outputs[output_idx].owner == TensorOwner::USER && !is_remote); if (update_device_tensor) { - m_plugin_outputs[name] = create_or_share_device_tensor(user_tensor_wrapper, name, pshape, device_tensor_et, is_cpu_impl || convert_needed); + m_plugin_outputs[output_idx] = + create_or_share_device_tensor(user_tensor_wrapper, internal_name, pshape, device_tensor_et, is_cpu_impl || convert_needed); } } 
// Missing output in _plugin_outputs means that the network is dynamic and outputs couldn't be pre-allocated - if (m_plugin_outputs.find(name) == m_plugin_outputs.end()) + if (m_plugin_outputs.find(output_idx) == m_plugin_outputs.end()) return {}; - auto output_tensor = std::dynamic_pointer_cast(m_plugin_outputs.at(name).ptr); + auto output_tensor = std::dynamic_pointer_cast(m_plugin_outputs.at(output_idx).ptr); auto output_memory = output_tensor->get_memory(); - GPU_DEBUG_TRACE_DETAIL << name << " prepare output: " << output_memory->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << internal_name << " with index " << output_idx << " prepare output: " << output_memory->buffer_ptr() << std::endl; return network->set_output_memory(internal_name, output_memory); } void SyncInferRequest::init_mappings() { - for (const auto& in : get_inputs()) { - auto port_name = get_port_name(in); - m_input_ports_map[port_name] = in; - } - for (const auto& out : get_outputs()) { - auto port_name = get_port_name(out); - m_output_ports_map[port_name] = out; - m_output_names_map[port_name] = m_graph->out_name_to_internal(port_name); + const auto& inputs = get_inputs(); + for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) { + m_input_ports_map[input_idx] = inputs[input_idx]; + } + + const auto& outputs = get_outputs(); + for (size_t output_idx = 0; output_idx < outputs.size(); ++output_idx) { + m_output_ports_map[output_idx] = outputs[output_idx]; + m_output_names_map[output_idx] = m_graph->out_port_index_to_internal(output_idx); } }
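For reference, the index-based bookkeeping introduced throughout this patch (inputPrimitiveIDs, prevPrimitiveIDs, inputLayouts, and the m_user_*/m_plugin_* maps) keys everything by the parameter/result position that ProgramBuilder::get_parameter_index() and get_result_index() obtain from ov::Model. Below is a minimal sketch of that lookup pattern using only the public ov::Model API; the helper names (input_shapes_by_index, output_producers_by_index) are illustrative and not part of the change:

    #include <cstdint>
    #include <map>
    #include <memory>
    #include <string>

    #include "openvino/core/model.hpp"
    #include "openvino/op/parameter.hpp"
    #include "openvino/op/result.hpp"

    // Input shapes keyed by parameter index, similar to what get_max_batch_size() now builds.
    std::map<size_t, ov::PartialShape> input_shapes_by_index(const std::shared_ptr<const ov::Model>& model) {
        std::map<size_t, ov::PartialShape> shapes;
        for (const auto& param : model->get_parameters()) {
            const int64_t idx = model->get_parameter_index(param);  // position within get_parameters()
            if (idx >= 0)
                shapes[static_cast<size_t>(idx)] = param->get_output_partial_shape(0);
        }
        return shapes;
    }

    // Producer names keyed by result index, similar to how CreateResultOp() now fills prevPrimitiveIDs.
    std::map<size_t, std::string> output_producers_by_index(const std::shared_ptr<const ov::Model>& model) {
        std::map<size_t, std::string> producers;
        for (const auto& result : model->get_results()) {
            const int64_t idx = model->get_result_index(result->output(0));  // -1 if the result is not in the model
            if (idx >= 0)
                producers[static_cast<size_t>(idx)] = result->get_input_node_shared_ptr(0)->get_friendly_name();
        }
        return producers;
    }

Both ov::Model lookups return -1 when the node does not belong to the model, which is why CreateResultOp() asserts that the returned port index is not -1 before using it as a map key.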
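Port indices also match how the OpenVINO 2.0 runtime API addresses inputs and outputs, which appears to be the motivation for dropping the legacy ov::op::util::get_ie_output_name() lookups: a friendly or tensor name can change or disappear under transformations, while the position of a port stays stable for a given compiled model. A hedged application-side usage sketch; the "model.xml" path is a placeholder:

    #include "openvino/openvino.hpp"

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");               // placeholder model path
        auto compiled_model = core.compile_model(model, "GPU");
        auto infer_request = compiled_model.create_infer_request();

        // Ports are addressed positionally: index 0 is the first model input/output,
        // the same index the plugin-side maps in this patch are keyed by.
        ov::Tensor input = infer_request.get_tensor(compiled_model.input(0));
        // ... fill `input` with data ...
        infer_request.infer();
        ov::Tensor output = infer_request.get_tensor(compiled_model.output(0));
        (void)output;
        return 0;
    }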