[CPU] Introduce SubModel op and Composite node
with the idea of wrapping up (grouping) parts of the model / graph
into an inner model / graph with (almost) no runtime overhead.
The SubModel op and Composite node are expected to perform no extra logic
and to only execute the inner model / graph.
EgorDuplensky committed Jul 5, 2024
1 parent 2a9af43 commit 32f76fb
Showing 13 changed files with 416 additions and 24 deletions.
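
For orientation, a minimal sketch of the intended usage: a part of an ov::Model is wrapped (grouped) into a SubModel op, which the CPU plugin then replicates as a single Composite node that simply executes the inner graph. The SubModel constructor used below is an assumption for illustration only; the op's header is part of this commit but is not shown on this page.

// Hypothetical sketch: group a small chain of ops into a SubModel.
// The SubModel(outer inputs, inner model) constructor is assumed, not confirmed by this diff.
#include "openvino/core/model.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/relu.hpp"
#include "transformations/cpu_opset/common/op/submodel.hpp"

std::shared_ptr<ov::Node> wrap_into_submodel(const ov::Output<ov::Node>& outer_input) {
    // Inner graph to be grouped: Parameter -> Relu
    auto param = std::make_shared<ov::op::v0::Parameter>(outer_input.get_element_type(),
                                                         outer_input.get_partial_shape());
    auto relu = std::make_shared<ov::op::v0::Relu>(param);
    auto inner = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{param});

    // Assumed constructor: outer inputs + inner model (composite.cpp below only relies on
    // SubModel::get_function() to obtain the body)
    return std::make_shared<ov::intel_cpu::SubModel>(ov::OutputVector{outer_input}, inner);
}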
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/cpu_types.cpp
@@ -231,6 +231,7 @@ static const TypeToNameMap& get_type_to_name_tbl() {
{"Multinomial", Type::Multinomial},
{"Reference", Type::Reference},
{"Subgraph", Type::Subgraph},
{"SubModel", Type::SubModel},
{"PriorBox", Type::PriorBox},
{"PriorBoxClustered", Type::PriorBoxClustered},
{"Interaction", Type::Interaction},
@@ -361,6 +362,7 @@ std::string NameFromType(const Type type) {
CASE(Multinomial);
CASE(Reference);
CASE(Subgraph);
CASE(SubModel);
CASE(PriorBox);
CASE(PriorBoxClustered)
CASE(MHA);
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/cpu_types.h
@@ -7,6 +7,7 @@
#include <string>
#include <vector>

#include "transformations/cpu_opset/common/op/submodel.hpp"
#include "utils/caseless.hpp"

namespace ov {
@@ -114,6 +115,7 @@ enum class Type {
MulticlassNms,
Multinomial,
Subgraph,
SubModel,
PriorBox,
PriorBoxClustered,
Interaction,
83 changes: 63 additions & 20 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -76,6 +76,8 @@ void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) {

InitGraph();

Allocate();

CPU_DEBUG_CAP_ENABLE(serialize(*this));
}

@@ -108,11 +110,16 @@ void Graph::CreateGraph(const std::vector<NodePtr>& graphNodes,

InitGraph();

Allocate();

CPU_DEBUG_CAP_ENABLE(serialize(*this));
}

template void Graph::CreateGraph(const std::shared_ptr<const ov::Model>&, const GraphContext::CPtr);
void Graph::Replicate(const std::shared_ptr<const ov::Model> &model) {

void Graph::Replicate(const std::shared_ptr<const ov::Model> &model,
const VecMemoryDescs& inputDescriptors,
bool zeroCopyOutputs) {
OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model");
this->_name = model->get_friendly_name();
this->reuse_io_tensors = false;
@@ -150,6 +157,11 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model> &model) {
if (node->isDynamicNode()) {
graphHasDynamicInput = true;
}

if (!inputDescriptors.empty()) {
auto inputNode = std::dynamic_pointer_cast<node::Input>(node);
inputNode->setMemDesc(inputDescriptors[input_index]);
}
}

if (op->get_type_info() == op::v0::Result::get_type_info_static()) {
@@ -159,6 +171,10 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model> &model) {
op->get_friendly_name(),
" in model result list!");
outputNodesMap[output_index] = node;
if (zeroCopyOutputs) {
auto outputNode = std::dynamic_pointer_cast<node::Input>(node);
outputNode->setZeroCopyOutput();
}
}

op2node[op] = node;
@@ -320,8 +336,40 @@ static std::tuple<std::vector<NodePtr>, std::vector<size_t>> ExtractExecutableNodesAndSyncPoints(
std::move(executableSyncNodesInds));
}

void Graph::Configure(const std::shared_ptr<const ov::Model>& network,
const GraphContext::CPtr ctx,
const VecMemoryDescs& inputDescriptors,
const bool zeroCopyOutputs) {
OPENVINO_ASSERT(status == Status::NotReady, "Invalid graph status");

context = ctx;

Replicate(network, inputDescriptors, zeroCopyOutputs);

InitGraph();
}

void Graph::Allocate() {
OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status");

const bool hasDynNodes = ProcessDynNodes();
const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector<size_t>{};

Allocate(syncNodesInds);

CreatePrimitivesAndExecConstants();

#ifndef CPU_DEBUG_CAPS
for (auto &graphNode : graphNodes) { graphNode->cleanup(); }
#endif

std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes);

status = hasDynNodes ? Status::ReadyDynamic : Status::ReadyStatic;

CPU_DEBUG_CAP_ENABLE(serialize(*this));
}

void Graph::InitGraph(bool optimize) {
DEBUG_LOG("Initializing graph with name: ", GetName());
OPENVINO_ASSERT(status == Status::NotReady, "Invalid graph status");

GraphOptimizer optimizer;

@@ -351,24 +399,7 @@ void Graph::InitGraph(bool optimize) {

SortTopologically();

const bool hasDynNodes = ProcessDynNodes();
const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector<size_t>{};

Allocate(syncNodesInds);

CreatePrimitivesAndExecConstants();

#ifndef CPU_DEBUG_CAPS
for (auto &graphNode : graphNodes) {
graphNode->cleanup();
}
#endif

std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes);

status = hasDynNodes ? Status::ReadyDynamic : Status::ReadyStatic;

CPU_DEBUG_CAP_ENABLE(serialize(*this));
status = Status::Initialized;
}

void Graph::InitNodes() {
@@ -1122,6 +1153,18 @@ void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ITensor>>&
}
}

VecMemoryDescs Graph::getOutputMemoryDescriptors() {
OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status");

VecMemoryDescs result;
for (const auto& output : outputNodesMap) {
const auto& node = output.second;
result.emplace_back(node->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc());
}

return result;
}
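
Together with Configure() and Allocate() above, this accessor enables the new two-phase lifecycle: an outer node configures an inner graph (Replicate + InitGraph, up to Status::Initialized), reads its output memory descriptors to build its own port configuration, and only then triggers allocation. A condensed sketch of that flow, mirroring how the Composite node below drives it (the helper function and its parameter names are illustrative only):

#include "graph.h"  // plugin-internal: ov::intel_cpu::Graph

namespace ov {
namespace intel_cpu {

// Illustrative helper, not part of this commit.
void runInnerGraph(const std::shared_ptr<const ov::Model>& innerModel,
                   const GraphContext::CPtr& context,
                   const VecMemoryDescs& parentInputDescriptors) {
    Graph inner;
    // 1) Replicate + InitGraph only: descriptors are selected, no memory is allocated yet
    inner.Configure(innerModel, context, parentInputDescriptors, /*zeroCopyOutputs=*/true);
    // 2) The inner graph's output descriptors can now be exposed as the outer node's own
    const VecMemoryDescs outerOutputDescs = inner.getOutputMemoryDescriptors();
    (void)outerOutputDescs;  // the real Composite node uses these to build its NodeConfig
    // 3) Once the outer edges are resolved and mapped onto the inner ones, allocate the rest
    inner.Allocate();
    // 4) Execute the inner graph; the boundary adds no extra copies
    inner.Infer();
}

}  // namespace intel_cpu
}  // namespace ov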

void Graph::InferStatic(SyncInferRequest* request) {
dnnl::stream stream(getEngine());

18 changes: 15 additions & 3 deletions src/plugins/intel_cpu/src/graph.h
@@ -34,8 +34,9 @@ class Graph {

enum class Status {
NotReady = 0,
ReadyStatic = 1,
ReadyDynamic = 2
Initialized = 1,
ReadyStatic = 2,
ReadyDynamic = 3,
};

Graph() = default;
@@ -61,6 +62,9 @@
void PushInputData(const std::size_t& index, const ov::SoPtr<ITensor>& input);
void PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ITensor>>& output);

// Returns the memory descriptors of the graph's output nodes
VecMemoryDescs getOutputMemoryDescriptors();

void Infer(SyncInferRequest* request = nullptr);

const std::vector<NodePtr>& GetNodes() const {
@@ -185,6 +189,12 @@ class Graph {

Status getStatus() const {return status;}
const std::unordered_map<std::string, node::MemoryStateNode*>& getInternalStateNodes() const;
void Configure(const std::shared_ptr<const ov::Model>& network,
const GraphContext::CPtr ctx,
const VecMemoryDescs& inputDescriptors = {},
const bool zeroCopyOutputs = false);
void Allocate();

void InitGraph(bool optimize = true);

protected:
@@ -214,7 +224,9 @@

bool graphHasDynamicInput = false;

void Replicate(const std::shared_ptr<const ov::Model> &subgraph);
void Replicate(const std::shared_ptr<const ov::Model> &subgraph,
const VecMemoryDescs& inputDescriptors = {},
bool zeroCopyOutputs = false);
void InitNodes();
void InitDescriptors();
void ResolveInplaceDirections();
13 changes: 13 additions & 0 deletions src/plugins/intel_cpu/src/node.cpp
@@ -415,6 +415,19 @@ MemoryDescPtr Node::getBaseMemDescAtOutputPort(size_t portNum) const {
OPENVINO_THROW("Can't get output memory desc, primitive descriptor is not selected");
}

MemoryDescPtr Node::getParentOutputMemDesc(const EdgePtr& edge) {
const auto parentPtr = edge->getParent();
const auto parentSpd = parentPtr->getSelectedPrimitiveDescriptor();
OPENVINO_ASSERT(parentSpd, "Parent selected primitive descriptor is missing");

const auto& parentOutConfs = parentSpd->getConfig().outConfs;
OPENVINO_ASSERT(!parentOutConfs.empty(), "Parent output configuration is empty");

const int inNum = edge->getInputNum();

return parentOutConfs[inNum].getMemDesc();
}

std::string Node::getPrimitiveDescriptorType() const {
auto selectedPrimitiveDesc = getSelectedPrimitiveDescriptor();

8 changes: 8 additions & 0 deletions src/plugins/intel_cpu/src/node.h
@@ -10,6 +10,7 @@
#include "cpu_shape.h"
#include "cpu_types.h"
#include "edge.h"
#include "memory_desc/cpu_memory_desc.h"
#include "selective_build.h"
#include "memory_desc/dnnl_memory_desc.h"
#include "onednn/dnnl.h"
@@ -394,6 +395,13 @@ class Node {
*/
MemoryDescPtr getBaseMemDescAtOutputPort(size_t portNum) const;

/**
* @brief Returns the parent's output memory descriptor for the given \p edge
* must be used after selectOptimalPrimitiveDescriptor stage
* @param edge the edge connecting this node to its parent
* @return pointer to the parent's output memory descriptor of type MemoryDesc
*/
static MemoryDescPtr getParentOutputMemDesc(const EdgePtr& edge);
/**
* @brief Returns input selected primitive descriptor on the specified port
* must be used after selectOptimalPrimitiveDescriptor stage
114 changes: 114 additions & 0 deletions src/plugins/intel_cpu/src/nodes/composite.cpp
@@ -0,0 +1,114 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "composite.h"

#include "cpu_memory.h"
#include "transformations/cpu_opset/common/op/submodel.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {
namespace node {

bool Composite::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept {
return ov::is_type<ov::intel_cpu::SubModel>(op);
}

Composite::Composite(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr& context)
: Node(op, context, NgraphShapeInferFactory(op, FULL_PORT_MASK)) {
const auto& subModel = ov::as_type_ptr<SubModel>(op);
OPENVINO_ASSERT(subModel, "Attempt to create Composite node from an invalid op type: ", op);

m_body = subModel->get_function();
}

void Composite::selectOptimalPrimitiveDescriptor() {
// for the input configuration, always use the parent configuration
VecMemoryDescs inputDescriptors;
for (size_t j = 0; j < getParentEdges().size(); j++) {
inputDescriptors.emplace_back(getParentOutputMemDesc(getParentEdgeAt(j)));
}

std::vector<PortConfig> inConfs;
for (const auto& desc : inputDescriptors) {
inConfs.emplace_back(desc);
}

// configure the inner graph to get the information about output memory descriptors
m_graph.Configure(m_body, context, inputDescriptors, true);

// for the output descriptors, use the configuration of the graph's output nodes
auto outputDescriptors = m_graph.getOutputMemoryDescriptors();

std::vector<PortConfig> outConfs;
for (const auto& desc : outputDescriptors) {
outConfs.emplace_back(desc);
}

const NodeConfig config(inConfs, outConfs);

supportedPrimitiveDescriptors.clear();
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef);

selectPrimitiveDescriptorByIndex(0);
}

// @todo add ascii diagram for memory mapping / reuse
void Composite::createPrimitive() {
// Point the memory of the inner graph's input edges to the corresponding memory of the node's parent edges
OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(),
"Number of node inputs must be equal to the number of the inner graph's inputs");

for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
const auto input = m_graph.GetInputNodesMap()[i];

for (size_t j = 0; j < input->getChildEdges().size(); j++) {
input->getChildEdgeAt(j)->reuse(getSrcMemoryAtPort(i));
}
}

// Point the memory of the inner graph's output edges to the corresponding memory of the node's child edges
// The extra child edges on output ports will be updated after the inference of the inner graph
OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(),
"Number of node outputs must be equal to the number of the inner graph's outputs");

for (size_t i = 0; i < getOriginalOutputsNumber(); i++) {
const auto output = m_graph.GetOutputNodesMap()[i];
output->getParentEdgeAt(0)->reuse(getDstMemoryAtPort(i));
}

// Allocate inner graph's memory
m_graph.Allocate();
}

void Composite::execute(dnnl::stream) {
m_graph.Infer();

if (!inputShapesModified())
return;

// since the shape inference is not performed for the Composite node,
// the memory of the extra child edges attached to the output ports
// has to be updated after the inference of the inner graph has finished
for (size_t i = 0; i < getOriginalOutputsNumber(); i++) {
const auto mem = getDstMemoryAtPort(i);
auto& childEdges = getChildEdges();
for (size_t j = getOriginalOutputsNumber(); j < childEdges.size(); j++) {
auto& childEdge = childEdges[j];
auto childEdgePtr = childEdge.lock();
if (childEdgePtr->getInputNum() == static_cast<int>(i)) {
childEdgePtr->getMemoryPtr()->redefineDesc(mem->getDescPtr());
}
}
}
}

void Composite::executeDynamicImpl(dnnl::stream strm) {
execute(strm);
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
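
As a rough stand-in for the ascii diagram the @todo above asks for, the memory mapping established by Composite::createPrimitive() can be pictured as follows (schematic only):

// parent node --edge--> [Composite src memory i] <--reuse-- child edges of inner Input node i
//                                   |
//                            inner graph nodes
//                                   |
// child node  <--edge-- [Composite dst memory k] <--reuse-- parent edge of inner Output node k
//
// The inner graph therefore reads directly from the Composite node's source memory and writes
// directly into its destination memory, which is how the SubModel executes without extra copies.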