Performance fixes (openvinotoolkit#116)
dmitry-gorokhov committed Apr 5, 2023
1 parent e3afabc commit a4c7e14
Showing 33 changed files with 3,607 additions and 2,943 deletions.
31 changes: 31 additions & 0 deletions src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
@@ -246,5 +246,36 @@ dnnl::algorithm DnnlExtensionUtils::convertToDnnlAlgorithm(Algorithm alg) {
}
}

+bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) {
+#if defined(OV_CPU_WITH_ACL)
+    return one_of(alg, Algorithm::EltwiseRelu,
+                  Algorithm::EltwiseTanh,
+                  Algorithm::EltwiseElu,
+                  Algorithm::EltwiseAbs,
+                  Algorithm::EltwiseSqrt,
+                  Algorithm::EltwiseSoftRelu,
+                  Algorithm::EltwiseSigmoid);
+#elif defined(OPENVINO_ARCH_X86_64)
+    return one_of(alg, Algorithm::EltwiseRelu,
+                  Algorithm::EltwiseGeluErf,
+                  Algorithm::EltwiseGeluTanh,
+                  Algorithm::EltwiseElu,
+                  Algorithm::EltwiseSigmoid,
+                  Algorithm::EltwiseClamp,
+                  Algorithm::EltwiseTanh,
+                  Algorithm::EltwiseSwish,
+                  Algorithm::EltwiseHswish,
+                  Algorithm::EltwiseMish,
+                  Algorithm::EltwiseHsigmoid,
+                  Algorithm::EltwiseRoundHalfToEven,
+                  Algorithm::EltwiseRoundHalfAwayFromZero,
+                  Algorithm::EltwiseAbs,
+                  Algorithm::EltwiseSqrt,
+                  Algorithm::EltwiseSoftRelu);
+#else
+    return false;
+#endif
+}

} // namespace intel_cpu
} // namespace ov
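For readers outside the codebase: one_of is the plugin's variadic membership helper, used heavily above. A minimal sketch of its shape, for illustration only (the plugin's actual implementation may differ):

#include <initializer_list>

// Sketch of a one_of-style helper: true if `val` compares equal to any of `items`.
template <typename T, typename... Args>
bool one_of(const T& val, const Args&... items) {
    bool match = false;
    // Expand the comparison against every candidate in the pack.
    (void)std::initializer_list<bool>{(match = match || (val == items))...};
    return match;
}

With such a helper, isUnarySupportedAsPostOp reduces the per-backend "is this unary eltwise fusable as a post-op" check to a single readable call.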
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/dnnl_extension_utils.h
Original file line number Diff line number Diff line change
@@ -58,6 +58,7 @@ class DnnlExtensionUtils {
static dnnl_memory_desc_t clone_desc(const_dnnl_memory_desc_t cdesc);
static const char* query_pd_info(const_dnnl_primitive_desc_t pd);
static dnnl::algorithm convertToDnnlAlgorithm(Algorithm alg);
+    static bool isUnarySupportedAsPostOp(Algorithm alg);
};

} // namespace intel_cpu
29 changes: 2 additions & 27 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -988,22 +988,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph) {
continue;
}

-        if (!one_of(fuseCandidate->getAlgorithm(), Algorithm::EltwiseRelu,
-                    Algorithm::EltwiseGeluErf,
-                    Algorithm::EltwiseGeluTanh,
-                    Algorithm::EltwiseElu,
-                    Algorithm::EltwiseSigmoid,
-                    Algorithm::EltwiseClamp,
-                    Algorithm::EltwiseTanh,
-                    Algorithm::EltwiseSwish,
-                    Algorithm::EltwiseHswish,
-                    Algorithm::EltwiseMish,
-                    Algorithm::EltwiseHsigmoid,
-                    Algorithm::EltwiseRoundHalfToEven,
-                    Algorithm::EltwiseRoundHalfAwayFromZero,
-                    Algorithm::EltwiseAbs,
-                    Algorithm::EltwiseSqrt,
-                    Algorithm::EltwiseSoftRelu)) {
+        if (!DnnlExtensionUtils::isUnarySupportedAsPostOp(fuseCandidate->getAlgorithm())) {
parent++;
continue;
}
@@ -1176,17 +1161,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) {

auto isFusingSupported = [&](NodePtr conv, NodePtr child) {
return child->getType() == Type::Eltwise &&
-               one_of(child->getAlgorithm(), Algorithm::EltwiseRelu,
-                      Algorithm::EltwiseElu,
-                      Algorithm::EltwiseSigmoid,
-                      Algorithm::EltwiseClamp,
-                      Algorithm::EltwiseSwish,
-                      Algorithm::EltwiseHswish,
-                      Algorithm::EltwiseMish,
-                      Algorithm::EltwiseHsigmoid,
-                      Algorithm::EltwiseRoundHalfToEven,
-                      Algorithm::EltwiseRoundHalfAwayFromZero,
-                      Algorithm::EltwiseSoftRelu);
+               DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm());
};

for (auto &graphNode : graphNodes) {
27 changes: 10 additions & 17 deletions src/plugins/intel_cpu/src/node.cpp
@@ -960,6 +960,9 @@ void Node::cleanup() {
const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
+        // Undef impl type is used to express use cases where the real type is unknown during compilation
+        // Undef has a higher priority than defined types in order to force the primitive selection logic to make a decision based on other properties
+        impl_desc_type::undef,
impl_desc_type::brgconv_avx512_amx_1x1,
impl_desc_type::brgconv_avx512_amx,
impl_desc_type::jit_avx512_amx_dw,
@@ -989,6 +992,7 @@ const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
impl_desc_type::gemm_avx2,
impl_desc_type::gemm_avx,
impl_desc_type::gemm_sse42,
+        impl_desc_type::acl,
impl_desc_type::jit_gemm,
impl_desc_type::ref_any,
impl_desc_type::ref,
@@ -1341,6 +1345,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ngraph::Node>& op, const
}

bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
+#if defined(OPENVINO_ARCH_X86_64)
IE_ASSERT(parentNode);

size_t fusingPort = 0;
@@ -1391,6 +1396,10 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
Algorithm::EltwisePrelu,
Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput())
|| isConvertablePowerStatic();
+#else
+    // TODO: provide correct list of operations for other backends
+    return false;
+#endif
}

// @todo shifts for Subtract and scales for Divide are replaced with
@@ -1607,23 +1616,7 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const {
}
return ret;
} else if (node->getType() == Type::Eltwise) {
-        return one_of(node->getAlgorithm(),
-                      Algorithm::EltwiseRelu,
-                      Algorithm::EltwiseGeluErf,
-                      Algorithm::EltwiseGeluTanh,
-                      Algorithm::EltwiseElu,
-                      Algorithm::EltwiseSigmoid,
-                      Algorithm::EltwiseClamp,
-                      Algorithm::EltwiseTanh,
-                      Algorithm::EltwiseSwish,
-                      Algorithm::EltwiseHswish,
-                      Algorithm::EltwiseMish,
-                      Algorithm::EltwiseHsigmoid,
-                      Algorithm::EltwiseRoundHalfToEven,
-                      Algorithm::EltwiseRoundHalfAwayFromZero,
-                      Algorithm::EltwiseAbs,
-                      Algorithm::EltwiseSqrt,
-                      Algorithm::EltwiseSoftRelu) ||
+        return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) ||
node->canBePerformedAsScaleShift(this);
}
return false;
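The priorities vector above drives primitive-descriptor selection: the descriptor whose impl type appears earliest in the list wins, which is why undef now sits near the top. A hypothetical illustration of that ranking idea (DescInfo, selectBest, and the int stand-in for impl_desc_type are inventions for this sketch, not the plugin's code):

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical descriptor record; the real plugin stores far richer state.
struct DescInfo {
    int implType;  // stand-in for impl_desc_type
};

// Pick the descriptor whose impl type ranks earliest in `priorities`;
// types absent from the list rank last.
std::size_t selectBest(const std::vector<DescInfo>& descs,
                       const std::vector<int>& priorities) {
    std::size_t bestIdx = 0;
    std::size_t bestRank = priorities.size();
    for (std::size_t i = 0; i < descs.size(); ++i) {
        auto it = std::find(priorities.begin(), priorities.end(), descs[i].implType);
        auto rank = static_cast<std::size_t>(it - priorities.begin());
        if (rank < bestRank) {
            bestRank = rank;
            bestIdx = i;
        }
    }
    return bestIdx;
}

Under such a scheme, ranking undef above the defined types keeps descriptors with a not-yet-known impl type competitive, so the choice falls back to other properties, as the new comment explains.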
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -327,6 +327,7 @@ InferenceEngine::Precision Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) {
const std::vector<impl_desc_type>& Convolution::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
+        impl_desc_type::dw_acl,
impl_desc_type::winograd_acl,
impl_desc_type::gemm_acl,
impl_desc_type::brgconv_avx512_amx_1x1,
65 changes: 40 additions & 25 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -10,19 +10,7 @@ namespace intel_cpu {

using namespace arm_compute;

-TensorShape eltwiseShapeCast(const VectorDims &dims) {
-    arm_compute::TensorShape tensorShape;
-    for (std::size_t i = 0; i < dims.size(); ++i) {
-        tensorShape.set(dims.size() - i - 1, dims[i], false);
-    }
-    if (tensorShape.num_dimensions() == 0) {
-        tensorShape.set(0, 1, false);
-        tensorShape.set_num_dimensions(1);
-    }
-    return tensorShape;
-}
-
-VectorDims reshape_sizes(VectorDims dims) {
+inline VectorDims reshape_sizes(VectorDims dims) {
const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
VectorDims result_dims(MAX_NUM_SHAPE - 1);
if (dims.size() >= MAX_NUM_SHAPE) {
@@ -46,29 +34,56 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vector<MemoryDescPtr> &srcDescs,
if (!postOps.empty()) { return false; }
aclEltwiseAttrs = eltwiseAttrs;

+    std::vector<arm_compute::TensorShape> srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size());
+    std::vector<arm_compute::DataLayout> srcDataLayout(srcDescs.size()), dstDataLayout(dstDescs.size());
+    std::vector<arm_compute::TensorInfo> srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size());
+    srcTensors = std::vector<arm_compute::Tensor>(srcDescs.size());
+    dstTensors = std::vector<arm_compute::Tensor>(dstDescs.size());
+
+    for (int i = 0; i < srcVecDims.size(); i++) {
+        srcVecDims[i] = shapeCast(reshape_sizes(srcDescs[i]->getShape().getDims()));
+    }
+    for (int i = 0; i < dstVecDims.size(); i++) {
+        dstVecDims[i] = shapeCast(reshape_sizes(dstDescs[i]->getShape().getDims()));
+    }
+
+    for (int i = 0; i < srcDescs.size(); i++) {
+        srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]);
+    }
+    for (int i = 0; i < dstDescs.size(); i++) {
+        dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]);
+    }
+
if (srcDescs.size() == 2 &&
srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) {
-        return false;
+        auto dim_size = srcDescs[0]->getShape().getDims().size();
+        auto mover = [&dim_size](TensorShape &_shape) {
+            if (dim_size == 5) { std::swap(_shape[2], _shape[3]); }
+            std::swap(_shape[1], _shape[2]);
+            std::swap(_shape[0], _shape[1]);
+        };
+        if (dim_size < 5) {
+            srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW;
+        } else {
+            srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCDHW;
+        }
+        mover(srcVecDims[0]);
+        mover(srcVecDims[1]);
+        mover(dstVecDims[0]);
}

-    std::vector<VectorDims> srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size());
-    std::vector<TensorInfo> srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size());
-    srcTensors = std::vector<arm_compute::Tensor>(srcDescs.size());
-    dstTensors = std::vector<arm_compute::Tensor>(dstDescs.size());
-
for (int i = 0; i < srcVecDims.size(); i++) {
-        srcVecDims[i] = reshape_sizes(srcDescs[i]->getShape().getDims());
-        srcTensorsInfo[i] = TensorInfo(eltwiseShapeCast(srcVecDims[i]), 1,
+        srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1,
                                        precisionToAclDataType(srcDescs[i]->getPrecision()),
-                                       getAclDataLayoutByMemoryDesc(srcDescs[i]));
+                                       srcDataLayout[i]);
srcTensors[i].allocator()->init(srcTensorsInfo[i]);
}

for (int i = 0; i < dstVecDims.size(); i++) {
-        dstVecDims[i] = reshape_sizes(dstDescs[i]->getShape().getDims());
-        dstTensorsInfo[i] = TensorInfo(eltwiseShapeCast(dstVecDims[i]), 1,
+        dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1,
                                        precisionToAclDataType(dstDescs[i]->getPrecision()),
-                                       getAclDataLayoutByMemoryDesc(dstDescs[i]));
+                                       dstDataLayout[i]);
dstTensors[i].allocator()->init(dstTensorsInfo[i]);
}

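The mover lambda in the hunk above re-permutes an ACL TensorShape, which stores dimensions innermost-first, so that a channels-last (nspc) buffer can be described to ACL under an NCHW/NCDHW layout tag. A toy walk-through of the 4D case, under one reading of the swap order (the char array merely labels dimension slots):

#include <algorithm>
#include <array>
#include <cstdio>

int main() {
    // Canonical NCHW dims cast to ACL's innermost-first order: slot 0 holds W.
    std::array<char, 4> shape = {'W', 'H', 'C', 'N'};
    // The two swaps from the mover lambda (4D case):
    std::swap(shape[1], shape[2]);  // -> W C H N
    std::swap(shape[0], shape[1]);  // -> C W H N, i.e. logical N,H,W,C
    // Read back as NCHW by ACL, the shape now matches the NHWC memory order.
    for (char d : shape) std::printf("%c ", d);  // prints: C W H N
    return 0;
}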
2 changes: 0 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp
@@ -10,8 +10,6 @@
namespace ov {
namespace intel_cpu {

-arm_compute::TensorShape eltwiseShapeCast(const VectorDims& dims);
-
class AclEltwiseExecutor : public EltwiseExecutor {
public:
AclEltwiseExecutor(const ExecutorContext::CPtr context);
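For context on the removed declaration: OpenVINO keeps dims outermost-first while ACL's TensorShape is innermost-first, so eltwiseShapeCast reversed the order (the shared shapeCast now used in acl_eltwise.cpp presumably follows the same convention). A standalone sketch of that reversal, mirroring the deleted implementation:

#include <cstddef>
#include <vector>

using VectorDims = std::vector<std::size_t>;

// Reverse outermost-first dims into an innermost-first list, the way the
// removed eltwiseShapeCast populated arm_compute::TensorShape.
VectorDims toInnermostFirst(const VectorDims& dims) {
    VectorDims out(dims.size());
    for (std::size_t i = 0; i < dims.size(); ++i) {
        out[dims.size() - i - 1] = dims[i];
    }
    if (out.empty()) {
        out.push_back(1);  // scalars become a one-element shape, as in the original
    }
    return out;
}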