Skip to content

Commit

Permalink
[CPU] [ARM] FullyConnected: performance measurement workarounds: eltwise workarounds: Round + Convert
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Aug 9, 2024
1 parent 563ad0b commit 9260157
Show file tree
Hide file tree
Showing 8 changed files with 227 additions and 46 deletions.
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/cpu_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,11 @@ static const TypeToNameMap& get_type_to_name_tbl() {
{"Loop", Type::TensorIterator},
{"ReadValue", Type::MemoryInput}, // for construction from name ctor, arbitrary name is used
{"Assign", Type::MemoryOutput}, // for construction from layer ctor
#ifdef OPENVINO_ARCH_ARM64
{"Convert", Type::Eltwise},
#else
{"Convert", Type::Convert},
#endif
{"NV12toRGB", Type::ColorConvert},
{"NV12toBGR", Type::ColorConvert},
{"I420toRGB", Type::ColorConvert},
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/cpu_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ enum class Algorithm {
EltwiseIsNaN,
EltwiseMultiply,
EltwiseSubtract,
EltwiseConvert,
EltwiseDivide,
EltwiseFloor,
EltwiseFloorMod,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,43 @@ std::set<std::vector<element::Type>> jit_clamp_emitter::get_supported_precisions
return {{element::f32}};
}

/// CONVERT ///
// Node-based constructor: the execution precision is derived from the node's
// arithmetic binary precision.
jit_convert_emitter::jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                         dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                                         const std::shared_ptr<ov::Node>& node)
    : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {}

// Precision-based constructor: used when no source node is available; the
// caller supplies the execution precision explicitly.
jit_convert_emitter::jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                         dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                                         const ov::element::Type exec_prc)
    : jit_emitter(host, host_isa, exec_prc) {}

// Convert is a unary operation — exactly one input operand.
size_t jit_convert_emitter::get_inputs_count() const {
    return 1;
}

// Element types accepted for the emitter's single input; the 'node' argument
// is not inspected.
std::set<std::vector<element::Type>> jit_convert_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
    static const std::set<std::vector<element::Type>> supported{
        {element::u8}, {element::i8}, {element::i32}, {element::i64}, {element::f16}, {element::f32}};
    return supported;
}

// Dispatches code generation to the ISA-specialized implementation; only the
// base ASIMD (NEON) instruction set is handled on ARM64.
void jit_convert_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const {
    if (host_isa_ != dnnl::impl::cpu::aarch64::asimd) {
        OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel");
    }
    emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
}

// Emits the body for the Convert operation.
// NOTE(review): this is a performance-measurement workaround, not a real
// implementation — the source vector register is copied verbatim to the
// destination (plain 128-bit mov) and no data-type conversion is performed,
// so results are only correct when input and output types already match.
// The precision assert below is disabled, presumably because Convert runs with
// the non-f32 precisions listed in get_supported_precisions — confirm before
// re-enabling.
template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
void jit_convert_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
    //OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string());

    using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
    TReg src = TReg(in_vec_idxs[0]);   // single input vector register
    TReg dst = TReg(out_vec_idxs[0]);  // output vector register
    h->mov(dst.b16, src.b16);          // placeholder: raw register copy, no conversion
}

/// DIVIDE ///
jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
Expand Down Expand Up @@ -1567,6 +1604,43 @@ void jit_relu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const st
h->fmaxnm(dst.s, src.s, tmp.s);
}

/// ROUND ///
// Node-based constructor: the execution precision is derived from the node's
// arithmetic binary precision.
jit_round_emitter::jit_round_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                     dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                                     const std::shared_ptr<ov::Node>& node)
    : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {}

// Precision-based constructor: used when no source node is available; the
// caller supplies the execution precision explicitly.
jit_round_emitter::jit_round_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                     dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                                     const ov::element::Type exec_prc)
    : jit_emitter(host, host_isa, exec_prc) {}

// Round is a unary operation — exactly one input operand.
size_t jit_round_emitter::get_inputs_count() const {
    return 1;
}

// Rounding is emitted for f32 input only; the 'node' argument is not inspected.
std::set<std::vector<element::Type>> jit_round_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
    static const std::set<std::vector<element::Type>> supported{{element::f32}};
    return supported;
}

// Dispatches code generation to the ISA-specialized implementation; only the
// base ASIMD (NEON) instruction set is handled on ARM64.
void jit_round_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const {
    if (host_isa_ != dnnl::impl::cpu::aarch64::asimd) {
        OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel");
    }
    emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
}

// Emits the body for the Round operation (f32 only, enforced by the assert).
// NOTE(review): this is a performance-measurement workaround, not a real
// implementation — it copies the source register bit-for-bit (plain 128-bit
// mov) instead of emitting an actual rounding instruction (presumably FRINTN
// for half-to-even / FRINTA for half-away-from-zero — confirm), so results are
// numerically wrong for non-integral inputs. TODO: implement real rounding.
template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
void jit_round_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
    OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string());

    using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
    TReg src = TReg(in_vec_idxs[0]);   // single input vector register
    TReg dst = TReg(out_vec_idxs[0]);  // output vector register
    h->mov(dst.b16, src.b16);          // placeholder: raw register copy, no rounding
}

/// SELECT ///
jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,27 @@ class jit_clamp_emitter : public jit_emitter {
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};

// JIT emitter that maps the Convert operation onto the ARM64 eltwise pipeline.
// NOTE(review): introduced as a performance-measurement workaround — the
// current emit_isa only copies registers and performs no element-type
// conversion (see the .cpp); confirm before relying on numerical results.
class jit_convert_emitter : public jit_emitter {
public:
    // Precision-based constructor (no source node available).
    jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                        dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                        const ov::element::Type exec_prc = ov::element::f32);

    // Node-based constructor; execution precision is derived from the node.
    jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                        dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                        const std::shared_ptr<ov::Node>& node);

    // Convert is unary: returns 1.
    size_t get_inputs_count() const override;

    // Element types accepted for the single input.
    static std::set<std::vector<element::Type>> get_supported_precisions(const std::shared_ptr<ov::Node>& node = nullptr);

private:
    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const override;

    template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};

class jit_divide_emitter : public jit_emitter {
public:
jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
Expand Down Expand Up @@ -612,6 +633,27 @@ class jit_relu_emitter : public jit_emitter {
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};

// JIT emitter that maps the Round operation onto the ARM64 eltwise pipeline.
// NOTE(review): introduced as a performance-measurement workaround — the
// current emit_isa only copies registers and performs no actual rounding
// (see the .cpp); confirm before relying on numerical results.
class jit_round_emitter : public jit_emitter {
public:
    // Precision-based constructor (no source node available).
    jit_round_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                      dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                      const ov::element::Type exec_prc = ov::element::f32);

    // Node-based constructor; execution precision is derived from the node.
    jit_round_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                      dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                      const std::shared_ptr<ov::Node>& node);

    // Round is unary: returns 1.
    size_t get_inputs_count() const override;

    // Element types accepted for the single input (f32 only).
    static std::set<std::vector<element::Type>> get_supported_precisions(const std::shared_ptr<ov::Node>& node = nullptr);

private:
    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const override;

    template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};

class jit_select_emitter : public jit_emitter {
public:
jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
Expand Down
10 changes: 10 additions & 0 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "nodes/common/cpu_convert.h"
#include "nodes/common/cpu_memcpy.h"
#include "nodes/convert.h"
#include "nodes/eltwise.h"
#include "nodes/input.h"
#include "nodes/reorder.h"
#include "nodes/memory.hpp"
Expand Down Expand Up @@ -573,6 +574,15 @@ void Graph::ResolveEdgeConflicts() {

for (ptrdiff_t i = 0; i < numberOfEdges; i++) {
auto edge = graphEdges[i];

// TODO: debug only
if (edge->getChild()->getType() == Type::Eltwise) {
const auto eltwise = std::dynamic_pointer_cast<node::Eltwise>(edge->getChild());
if ((eltwise != nullptr) && (eltwise->getAlgorithm() == Algorithm::EltwiseConvert)) {
continue;
}
}

auto reorderStatus = graphEdges[i]->needReorder();
DEBUG_LOG(graphEdges[i]->name(), " reorderStatus = ", reorderStatus);
if (reorderStatus == Edge::ReorderStatus::Regular) {
Expand Down
23 changes: 23 additions & 0 deletions src/plugins/intel_cpu/src/nodes/eltwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1068,6 +1068,9 @@ const std::map<const ov::DiscreteTypeInfo, Eltwise::Initializer>& Eltwise::getIn
node.algorithm = Algorithm::EltwiseMultiply;
node.broadcastingPolicy = determineBroadcastingPolicy(op);
}},
{ov::op::v0::Convert::get_type_info_static(), [](const std::shared_ptr<ov::Node>& op, Eltwise& node) {
node.algorithm = Algorithm::EltwiseConvert;
}},
{ov::op::v1::Divide::get_type_info_static(), [](const std::shared_ptr<ov::Node>& op, Eltwise& node) {
node.algorithm = Algorithm::EltwiseDivide;
node.broadcastingPolicy = determineBroadcastingPolicy(op);
Expand Down Expand Up @@ -2063,6 +2066,10 @@ bool Eltwise::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, st

Eltwise::Eltwise(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr context) :
Node(op, context, EltwiseShapeInferFactory()), broadcastingPolicy(Undefined) {
if (as_type_ptr<opset1::Convert>(op)) {
std::cout << "Eltwise::Eltwise" << std::endl;
}

std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage);
Expand All @@ -2072,6 +2079,7 @@ Eltwise::Eltwise(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr c

size_t Eltwise::getOpInputsNum() const {
switch (getAlgorithm()) {
case Algorithm::EltwiseConvert:
case Algorithm::EltwiseIsFinite:
case Algorithm::EltwiseIsInf:
case Algorithm::EltwiseIsNaN:
Expand Down Expand Up @@ -2153,6 +2161,10 @@ void Eltwise::getSupportedDescriptors() {
}

void Eltwise::initSupportedPrimitiveDescriptors() {
if (getAlgorithm() == Algorithm::EltwiseConvert) {
std::cout << "Eltwise::initSupportedPrimitiveDescriptors" << std::endl;
}

const auto isBitwise = [](const Algorithm& algorithm) {
return one_of(
algorithm,
Expand Down Expand Up @@ -2547,6 +2559,10 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
}

void Eltwise::createPrimitive() {
if (getAlgorithm() == Algorithm::EltwiseConvert) {
std::cout << "Eltwise::createPrimitive" << std::endl;
}

if (memPtrs.empty()) {
for (size_t i = 0; i < inputNum; i++)
memPtrs.push_back(getSrcMemoryAtPort(i));
Expand All @@ -2570,6 +2586,10 @@ void Eltwise::createPrimitive() {
}

void Eltwise::prepareParams() {
if (getAlgorithm() == Algorithm::EltwiseConvert) {
std::cout << "Eltwise::prepareParams" << std::endl;
}

if (canUseAclExecutor) {
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < getParentEdges().size(); i++) {
Expand Down Expand Up @@ -3029,6 +3049,9 @@ bool Eltwise::canFuseParent(const NodePtr& parentNode) const {
}

bool Eltwise::canFuse(const NodePtr& node) const {
if (getAlgorithm() == Algorithm::EltwiseConvert) {
std::cout << "Eltwise::canFuse" << std::endl;
}
auto isIntegerComputeSupported = [](const Node* node) {
if (!one_of(node->getAlgorithm(), Algorithm::EltwiseAdd,
Algorithm::EltwiseMultiply,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ bool JitEltwiseExecutor::isSupported(
const float alpha,
const float beta,
const float gamma) {
if (algorithm == Algorithm::EltwiseConvert) {
return true;
}
const auto is_supported = one_of(algorithm,
Algorithm::EltwiseAbs,
Algorithm::EltwiseAdd,
Algorithm::EltwiseClamp,
Algorithm::EltwiseConvert,
Algorithm::EltwiseDivide,
Algorithm::EltwiseElu,
Algorithm::EltwiseEqual,
Expand All @@ -41,6 +45,8 @@ bool JitEltwiseExecutor::isSupported(
Algorithm::EltwisePowerStatic,
Algorithm::EltwisePrelu,
Algorithm::EltwiseRelu,
Algorithm::EltwiseRoundHalfAwayFromZero,
Algorithm::EltwiseRoundHalfToEven,
Algorithm::EltwiseSelect,
Algorithm::EltwiseSigmoid,
Algorithm::EltwiseSubtract,
Expand Down
Loading

0 comments on commit 9260157

Please sign in to comment.