diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index cd6021a40a5a8c..3dff510e9bb44c 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -231,6 +231,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"Multinomial", Type::Multinomial}, {"Reference", Type::Reference}, {"Subgraph", Type::Subgraph}, + {"SubModel", Type::SubModel}, {"PriorBox", Type::PriorBox}, {"PriorBoxClustered", Type::PriorBoxClustered}, {"Interaction", Type::Interaction}, @@ -361,6 +362,7 @@ std::string NameFromType(const Type type) { CASE(Multinomial); CASE(Reference); CASE(Subgraph); + CASE(SubModel); CASE(PriorBox); CASE(PriorBoxClustered) CASE(MHA); diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 6834225c1f2515..3c71ae3a17193f 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -7,6 +7,7 @@ #include #include +#include "transformations/cpu_opset/common/op/submodel.hpp" #include "utils/caseless.hpp" namespace ov { @@ -114,6 +115,7 @@ enum class Type { MulticlassNms, Multinomial, Subgraph, + SubModel, PriorBox, PriorBoxClustered, Interaction, diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 2ff2ee2636ec02..bce3e409dcb491 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -76,6 +76,8 @@ void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) { InitGraph(); + Allocate(); + CPU_DEBUG_CAP_ENABLE(serialize(*this)); } @@ -108,11 +110,16 @@ void Graph::CreateGraph(const std::vector& graphNodes, InitGraph(); + Allocate(); + CPU_DEBUG_CAP_ENABLE(serialize(*this)); } template void Graph::CreateGraph(const std::shared_ptr&, const GraphContext::CPtr); -void Graph::Replicate(const std::shared_ptr &model) { + +void Graph::Replicate(const std::shared_ptr &model, + const VecMemoryDescs& inputDescriptors, + bool 
zeroCopyOutputs) { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model"); this->_name = model->get_friendly_name(); this->reuse_io_tensors = false; @@ -150,6 +157,11 @@ void Graph::Replicate(const std::shared_ptr &model) { if (node->isDynamicNode()) { graphHasDynamicInput = true; } + + if (!inputDescriptors.empty()) { + auto inputNode = std::dynamic_pointer_cast(node); + inputNode->setMemDesc(inputDescriptors[input_index]); + } } if (op->get_type_info() == op::v0::Result::get_type_info_static()) { @@ -159,6 +171,10 @@ void Graph::Replicate(const std::shared_ptr &model) { op->get_friendly_name(), " in model result list!"); outputNodesMap[output_index] = node; + if (zeroCopyOutputs) { + auto inputNode = std::dynamic_pointer_cast(node); + inputNode->setZeroCopyOutput(); + } } op2node[op] = node; @@ -320,8 +336,40 @@ static std::tuple, std::vector> ExtractExecutableNo std::move(executableSyncNodesInds)); } +void Graph::Configure(const std::shared_ptr& network, + const GraphContext::CPtr ctx, + const VecMemoryDescs& inputDescriptors, + const bool zeroCopyOutputs) { + OPENVINO_ASSERT(status == Status::NotReady, "Invalid graph status"); + + context = ctx; + + Replicate(network, inputDescriptors, zeroCopyOutputs); + + InitGraph(); +} + +void Graph::Allocate() { + OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); + + const bool hasDynNodes = ProcessDynNodes(); + const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + + Allocate(syncNodesInds); + + CreatePrimitivesAndExecConstants(); + + CPU_DEBUG_CAP_ENABLE(for (auto &graphNode : graphNodes) { graphNode->cleanup(); }) + + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + status = hasDynNodes ? 
Status::ReadyDynamic : Status::ReadyStatic; + + CPU_DEBUG_CAP_ENABLE(serialize(*this)); +} + void Graph::InitGraph(bool optimize) { - DEBUG_LOG("Initializing graph with name: ", GetName()); + OPENVINO_ASSERT(status == Status::NotReady, "Invalid graph status"); GraphOptimizer optimizer; @@ -351,24 +399,7 @@ void Graph::InitGraph(bool optimize) { SortTopologically(); - const bool hasDynNodes = ProcessDynNodes(); - const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; - - Allocate(syncNodesInds); - - CreatePrimitivesAndExecConstants(); - -#ifndef CPU_DEBUG_CAPS - for (auto &graphNode : graphNodes) { - graphNode->cleanup(); - } -#endif - - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - - status = hasDynNodes ? Status::ReadyDynamic : Status::ReadyStatic; - - CPU_DEBUG_CAP_ENABLE(serialize(*this)); + status = Status::Initialized; } void Graph::InitNodes() { @@ -1122,6 +1153,18 @@ void Graph::PullOutputData(std::unordered_map>& } } +VecMemoryDescs Graph::getOutputMemoryDescriptors() { + OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); + + VecMemoryDescs result; + for (const auto& output : outputNodesMap) { + const auto& node = output.second; + result.emplace_back(node->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()); + } + + return result; +} + void Graph::InferStatic(SyncInferRequest* request) { dnnl::stream stream(getEngine()); diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index 1a08446b59d9f6..0158d2e399370b 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -34,8 +34,9 @@ class Graph { enum class Status { NotReady = 0, - ReadyStatic = 1, - ReadyDynamic = 2 + Initialized = 1, + ReadyStatic = 2, + ReadyDynamic = 3, }; Graph() = default; @@ -61,6 +62,9 @@ class Graph { void PushInputData(const std::size_t& index, const ov::SoPtr& input); 
void PullOutputData(std::unordered_map>& output); + // Returns Output nodes memory descriptors + VecMemoryDescs getOutputMemoryDescriptors(); + void Infer(SyncInferRequest* request = nullptr); const std::vector& GetNodes() const { @@ -185,6 +189,12 @@ class Graph { Status getStatus() const {return status;} const std::unordered_map& getInternalStateNodes() const; + void Configure(const std::shared_ptr& network, + const GraphContext::CPtr ctx, + const VecMemoryDescs& inputDescriptors = {}, + const bool zeroCopyOutputs = false); + void Allocate(); + void InitGraph(bool optimize = true); protected: @@ -214,7 +224,9 @@ class Graph { bool graphHasDynamicInput = false; - void Replicate(const std::shared_ptr &subgraph); + void Replicate(const std::shared_ptr &subgraph, + const VecMemoryDescs& inputDescriptors = {}, + bool zeroCopyOutputs = false); void InitNodes(); void InitDescriptors(); void ResolveInplaceDirections(); diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 60f6206818783b..9f274177de7642 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -415,6 +415,19 @@ MemoryDescPtr Node::getBaseMemDescAtOutputPort(size_t portNum) const { OPENVINO_THROW("Can't get output memory desc, primitive descriptor is not selected"); } +MemoryDescPtr Node::getParentOutputMemDesc(const EdgePtr& edge) { + const auto parentPtr = edge->getParent(); + const auto parentSpd = parentPtr->getSelectedPrimitiveDescriptor(); + OPENVINO_ASSERT(parentSpd, "Parent selected primitive descriptor is missed"); + + const auto& parentOutConfs = parentSpd->getConfig().outConfs; + OPENVINO_ASSERT(!parentOutConfs.empty(), "Parent output configuration is empty"); + + const int inNum = edge->getInputNum(); + + return parentSpd->getConfig().outConfs[inNum].getMemDesc(); +} + std::string Node::getPrimitiveDescriptorType() const { auto selectedPrimitiveDesc = getSelectedPrimitiveDescriptor(); diff --git 
a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index d442c0280ab03c..d34e63bae6f0d0 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -10,6 +10,7 @@ #include "cpu_shape.h" #include "cpu_types.h" #include "edge.h" +#include "memory_desc/cpu_memory_desc.h" #include "selective_build.h" #include "memory_desc/dnnl_memory_desc.h" #include "onednn/dnnl.h" @@ -394,6 +395,13 @@ class Node { */ MemoryDescPtr getBaseMemDescAtOutputPort(size_t portNum) const; + /** + * @brief Returns parent output memory descriptor from given \p edge + * must be used after selectOptimalPrimitiveDescriptor stage + * @param edge + * @return pointer to parent output memory descriptor with type MemoryDesc + */ + static MemoryDescPtr getParentOutputMemDesc(const EdgePtr& edge); /** * @brief Returns input selected primitive descriptor on the specified port * must be used after selectOptimalPrimitiveDescriptor stage diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp new file mode 100644 index 00000000000000..da4447d868d2e9 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "composite.h" + +#include "cpu_memory.h" +#include "transformations/cpu_opset/common/op/submodel.hpp" +#include "utils/debug_capabilities.h" + +namespace ov { +namespace intel_cpu { +namespace node { + +bool Composite::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + return ov::is_type(op); +} + +Composite::Composite(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, NgraphShapeInferFactory(op, FULL_PORT_MASK)) { + const auto& subModel = ov::as_type_ptr(op); + OPENVINO_ASSERT(subModel, "Attempt to create SubGraph node from an invalid op type: ", op); + + m_body = subModel->get_function(); +} + +void 
Composite::selectOptimalPrimitiveDescriptor() { + // for the input configuration, just always use the parent configuration + VecMemoryDescs inputDescriptors; + for (size_t j = 0; j < getParentEdges().size(); j++) { + inputDescriptors.emplace_back(getParentOutputMemDesc(getParentEdgeAt(j))); + } + + std::vector<PortConfig> inConfs; + for (const auto& desc : inputDescriptors) { + inConfs.emplace_back(desc); + } + + // configure the inner graph to get the information about output memory descriptors + m_graph.Configure(m_body, context, inputDescriptors, true); + + // for the output descriptors, use the configuration of the graph's output nodes + auto outputDescriptors = m_graph.getOutputMemoryDescriptors(); + + std::vector<PortConfig> outConfs; + for (const auto& desc : outputDescriptors) { + outConfs.emplace_back(desc); + } + + const NodeConfig config(inConfs, outConfs); + + supportedPrimitiveDescriptors.clear(); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + + selectPrimitiveDescriptorByIndex(0); +} + +// @todo add ascii diagram for memory mapping / reuse +void Composite::createPrimitive() { + // Point a memory of the inner graph's input edges to the corresponding memory of the node parent edges + OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + "Number of node inputs must be equal the number of inner graph's inputs"); + + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + const auto input = m_graph.GetInputNodesMap()[i]; + + for (size_t j = 0; j < input->getChildEdges().size(); j++) { + input->getChildEdgeAt(j)->reuse(getSrcMemoryAtPort(i)); + } + } + + // Point a memory of the inner graph's output edges to the corresponding memory of the node child edges + // The extra child edges on output ports will be updated after the inference of the inner graph + OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + "Number of node outputs must be equal the number of inner graph's outputs"); + + for 
(size_t i = 0; i < getOriginalOutputsNumber(); i++) { + const auto output = m_graph.GetOutputNodesMap()[i]; + output->getParentEdgeAt(0)->reuse(getDstMemoryAtPort(i)); + } + + // Allocate inner graph's memory + m_graph.Allocate(); +} + +void Composite::execute(dnnl::stream) { + m_graph.Infer(); + + if (!inputShapesModified()) + return; + + // since the shape inference is not performed for the composite node + // a memory of the extra child edges, attached to the output ports + // has to be updated after an inference of the inner graph finished + for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + const auto mem = getDstMemoryAtPort(i); + auto& childEdges = getChildEdges(); + for (size_t j = getOriginalOutputsNumber(); j < childEdges.size(); j++) { + auto& childEdge = childEdges[j]; + auto childEdgePtr = childEdge.lock(); + if (childEdgePtr->getInputNum() == static_cast(i)) { + childEdgePtr->getMemoryPtr()->redefineDesc(mem->getDescPtr()); + } + } + } +} + +void Composite::executeDynamicImpl(dnnl::stream strm) { + execute(strm); +} + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/composite.h b/src/plugins/intel_cpu/src/nodes/composite.h new file mode 100644 index 00000000000000..f53c60881c5e66 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/composite.h @@ -0,0 +1,56 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "graph.h" +#include "node.h" + +namespace ov { +namespace intel_cpu { +namespace node { + +class Composite : public Node { +public: + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + Composite(const std::shared_ptr& op, const GraphContext::CPtr& context); + + bool created() const override { + return true; + } + + bool needShapeInfer() const override { + return false; + } + + bool needPrepareParams() const override { + return false; + } + + bool 
isExecutable() const override { + return true; + } + + void getSupportedDescriptors() override{}; + void selectOptimalPrimitiveDescriptor() override; + void createPrimitive() override; + void execute(dnnl::stream) override; + void executeDynamicImpl(dnnl::stream strm) override; + + const Graph& graph() const { + return m_graph; + } + +private: + std::shared_ptr m_body; + Graph m_graph; + std::shared_ptr m_executor; +}; + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 72a22132aba175..e5236c486c9bf7 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -5,6 +5,7 @@ #include "input.h" #include "cpu/x64/jit_generator.hpp" +#include "nodes/node_config.h" #include "openvino/core/parallel.hpp" #include "shape_inference/shape_inference_pass_through.hpp" @@ -470,6 +471,20 @@ void Input::initSupportedPrimitiveDescriptors() { } } +void Input::selectOptimalPrimitiveDescriptor() { + if (!(zeroCopyOutput && getType() == Type::Output)) + return Node::selectOptimalPrimitiveDescriptor(); + + // ignore previous configuration + supportedPrimitiveDescriptors.clear(); + + // and just use parent memory descriptor for Output node to avoid reorders insertion + NodeConfig config({PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)))}, {}); + + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); + selectPrimitiveDescriptorByIndex(0); +} + void Input::createPrimitive() { for (size_t i = 0; i < getChildEdges().size(); i++) { auto dstMemPtr = getDstMemoryAtPort(i); @@ -528,7 +543,6 @@ void Input::initSupportedPdFromMemDesc() { } supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown); } - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 
9b304e5a75a891..58edf1e8f23bd1 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -20,9 +20,14 @@ class Input : public Node { const std::string& type, const GraphContext::CPtr context); Input(MemoryDescPtr memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr context); + void setMemDesc(MemoryDescPtr memDesc) { extMemDesc = memDesc; } + void setZeroCopyOutput() { + zeroCopyOutput = true; + } void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; + void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; bool created() const override; @@ -47,6 +52,7 @@ class Input : public Node { std::shared_ptr constOp; MemoryCPtr memoryPtr; MemoryDescPtr extMemDesc = nullptr; + bool zeroCopyOutput = false; bool isMeanImage = false; }; diff --git a/src/plugins/intel_cpu/src/nodes_factory.cpp b/src/plugins/intel_cpu/src/nodes_factory.cpp index 9012c37f5ac23b..b35f5389c0bc6c 100644 --- a/src/plugins/intel_cpu/src/nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/nodes_factory.cpp @@ -9,6 +9,7 @@ #include "nodes/bucketize.h" #include "nodes/col2im.h" #include "nodes/color_convert.h" +#include "nodes/composite.h" #include "nodes/concat.h" #include "nodes/conv.h" #include "nodes/convert.h" @@ -209,6 +210,7 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") { INTEL_CPU_NODE(RDFT, Type::RDFT); INTEL_CPU_NODE(ExtractImagePatches, Type::ExtractImagePatches); INTEL_CPU_NODE(Subgraph, Type::Subgraph); + INTEL_CPU_NODE(Composite, Type::SubModel); INTEL_CPU_NODE(ScaledDotProductAttention, Type::ScaledDotProductAttention); #if defined(OPENVINO_ARCH_X86_64) INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.cpp new file mode 100644 index 00000000000000..0f72baed2b1206 --- /dev/null +++ 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "submodel.hpp" + +namespace ov { +namespace intel_cpu { + +SubModel::SubModel(const std::shared_ptr& body) + : SubGraphOp() { + SubGraphOp::set_function(body); +} + +SubModel::SubModel(const ov::OutputVector& args, + const std::shared_ptr& body) + : SubGraphOp(args) { + SubGraphOp::set_function(body); + constructor_validate_and_infer_types(); + for (size_t i = 0; i < body->get_parameters().size(); ++i) + m_input_descriptions[0].push_back(std::make_shared(i, i)); + for (size_t i = 0; i < body->get_output_size(); ++i) + m_output_descriptions[0].push_back(std::make_shared(i, i)); +} + +SubModel::SubModel(const ov::NodeVector& args, + const std::shared_ptr& body) + : SubModel(as_output_vector(args), body) {} + +std::shared_ptr SubModel::clone_with_new_inputs(const ov::OutputVector& inputs) const { + return std::make_shared(inputs, body().clone()); +} + +void SubModel::validate_and_infer_types() { + ov::ParameterVector old_parameters = body_ptr()->get_parameters(); + + for (size_t i = 0; i < get_input_size(); ++i) { + body_ptr()->replace_parameter( + i, + std::make_shared(get_input_element_type(i), get_input_partial_shape(i))); + } + + body_ptr()->validate_nodes_and_infer_types(); + + for (size_t i = 0; i < body_ptr()->get_parameters().size(); i++) { + body_ptr()->get_parameters()[i]->set_friendly_name(old_parameters[i]->get_friendly_name()); + } + + set_output_size(body_ptr()->get_output_size()); + for (size_t i = 0; i < get_output_size(); ++i) { + set_output_type(i, body_ptr()->get_output_element_type(i), body_ptr()->get_output_partial_shape(i)); + } +} + +bool SubModel::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("body", body_ptr()); + visitor.on_attribute("input_descriptions", m_input_descriptions[0]); + 
visitor.on_attribute("output_descriptions", m_output_descriptions[0]); + return true; +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.hpp new file mode 100644 index 00000000000000..03e5e19f3424a5 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <memory> + +#include "openvino/core/model.hpp" +#include "openvino/op/op.hpp" +#include "openvino/op/util/sub_graph_base.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * @interface SubModel + * @brief An operation that is implemented by a model + */ +class SubModel : public ov::op::util::SubGraphOp { +public: + OPENVINO_OP("SubModel", "cpu_plugin_opset"); + + SubModel() = default; + + SubModel(const std::shared_ptr<ov::Model>& body); + + SubModel(const OutputVector& args, const std::shared_ptr<ov::Model>& body); + + SubModel(const NodeVector& args, const std::shared_ptr<ov::Model>& body); + + bool visit_attributes(AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override; + + const ov::Model& body() const { + return *m_bodies[0]; + } + const std::shared_ptr<ov::Model>& body_ptr() const { + return m_bodies[0]; + } + +private: + ov::Model& body() { + return *m_bodies[0]; + } + std::shared_ptr<ov::Model>& body_ptr() { + return m_bodies[0]; + } +}; + +} // namespace intel_cpu +} // namespace ov