Skip to content

Commit

Permalink
[Snippets][CPU] Applied Vladislav & Ivan comments 3
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Jan 3, 2025
1 parent 1908740 commit f104183
Show file tree
Hide file tree
Showing 22 changed files with 69 additions and 75 deletions.
10 changes: 5 additions & 5 deletions src/common/snippets/include/snippets/op/load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op {
};

/**
* @interface LoadReshape
* @interface LoadReorder
* @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak
* shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to
* Load and Store. This is a temporary solution until tokenization of Reshape operation is supported.
* @ingroup snippets
*/
class LoadReshape : public Load {
class LoadReorder : public Load {
public:
OPENVINO_OP("LoadReshape", "SnippetsOpset", Load);
LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReshape() = default;
OPENVINO_OP("LoadReorder", "SnippetsOpset", Load);
LoadReorder(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReorder() = default;

void set_offset(size_t offset) { set_output_offset(offset, 0); }
void set_count(size_t count) { set_output_count(count, 0); }
Expand Down
2 changes: 1 addition & 1 deletion src/common/snippets/include/snippets/snippets_isa_tbl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

// SnippetS dialect
OV_OP(Load, ov::snippets::op)
OV_OP(LoadReshape, ov::snippets::op)
OV_OP(LoadReorder, ov::snippets::op)
OV_OP(LoopBegin, ov::snippets::op)
OV_OP(LoopEnd, ov::snippets::op)
OV_OP(Brgemm, ov::snippets::op)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) {
const auto& node = expr->get_node();
return ov::is_type<ov::snippets::op::Brgemm>(node) ||
ov::is_type<ov::snippets::op::Reshape>(node) ||
ov::is_type<ov::snippets::op::LoadReshape>(node);
ov::is_type<ov::snippets::op::LoadReorder>(node);
}
} // namespace

Expand Down
28 changes: 14 additions & 14 deletions src/common/snippets/src/op/load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,19 @@ std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args)
return std::make_shared<Load>(new_args.at(0), get_count(), get_offset());
}

LoadReshape::LoadReshape(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
LoadReorder::LoadReorder(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
: Load(x, count, offset), m_order(std::move(order)) {
const auto& in_shape = x.get_partial_shape();
const auto in_shape_size = in_shape.size();
OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size");
OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size");
OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 &&
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order");
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order");
const std::set<size_t> unique_dims(order.begin(), order.end());
OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements");
OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements");
constructor_validate_and_infer_types();
}

void LoadReshape::validate_and_infer_types() {
void LoadReorder::validate_and_infer_types() {
validate_memory_access_params();
const auto& old_shape = get_input_partial_shape(0);
ov::PartialShape new_shape;
Expand All @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() {
set_output_type(0, get_input_element_type(0), new_shape);
}

bool LoadReshape::visit_attributes(AttributeVisitor& visitor) {
bool LoadReorder::visit_attributes(AttributeVisitor& visitor) {
MemoryAccess::visit_attributes(visitor);
visitor.on_attribute("order", m_order);
return true;
}

std::shared_ptr<Node> LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadReshape);
std::shared_ptr<Node> LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadReorder);
check_new_args_count(this, new_args);
return std::make_shared<LoadReshape>(new_args.at(0), get_count(), get_offset(), m_order);
return std::make_shared<LoadReorder>(new_args.at(0), get_count(), get_offset(), m_order);
}
LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
const auto& loadReshape = ov::as_type_ptr<LoadReshape>(n);
OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer");
m_order = loadReshape->m_order;
LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
const auto& loadReorder = ov::as_type_ptr<LoadReorder>(n);
OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer");
m_order = loadReorder->m_order;
}
IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes");
return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success};
}
Expand Down
2 changes: 1 addition & 1 deletion src/common/snippets/src/op/reorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ void Reorder::custom_constructor_validate_and_infer_types(std::vector<size_t> or

void Reorder::validate_and_infer_types() {
const auto& input_pshape = get_input_partial_shape(0);
const auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout();
const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout();
OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(),
"Incompatible shape and order sizes");
const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order);
Expand Down
4 changes: 2 additions & 2 deletions src/common/snippets/src/pass/transpose_decomposition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ TransposeDecomposition::TransposeDecomposition() {
const auto subtensor = std::vector<size_t>{1};
const auto& layout = order->cast_vector<size_t>();

// todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation
// todo: LoadReorder used here is essentially Load + an easy way to maintain correct shape propagation
// fix this in future and develop a more consistent shape propagation approach.
auto load = std::make_shared<snippets::op::LoadReshape>(data_input, subtensor[0], 0, layout);
auto load = std::make_shared<snippets::op::LoadReorder>(data_input, subtensor[0], 0, layout);
auto store = std::make_shared<snippets::op::Store>(load, subtensor[0]);

PortDescriptorUtils::set_port_descriptor(load->input(0), subtensor, layout);
Expand Down
21 changes: 12 additions & 9 deletions src/common/snippets/src/runtime_configurator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,15 +120,18 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir)
const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param);
ExpressionPtr mem_desc_expr = param;
if (!shape_infer_seq.empty()) {
// If there is Reorder, we should take its desc because it affects on shape by target order
const auto& reordered_reshape_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(),
[](const ExpressionPtr& expr) {
return ov::is_type<op::Reorder>(expr->get_node());
});
if (reordered_reshape_it != shape_infer_seq.cend()) {
const auto& reshape = *reordered_reshape_it;
const auto& etype = reshape->get_node()->get_output_element_type(0);
update_io_parameters(reshape->get_input_port_descriptor(0), etype);
// [160048] Reorder, as any other ShapeInferOp, should just propagate input shape to output using target order
// without data movement. However, currently we have to save desc of input of the Reorder
// to support correct input data offsets calculations and MHAParallelWAOptimizer pass work.
// Please, remove this code part when the mentioned ticket is completed.
const auto& reorder_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(),
[](const ExpressionPtr& expr) {
return ov::is_type<op::Reorder>(expr->get_node());
});
if (reorder_it != shape_infer_seq.cend()) {
const auto& reorder = *reorder_it;
const auto& etype = reorder->get_node()->get_output_element_type(0);
update_io_parameters(reorder->get_input_port_descriptor(0), etype);
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
// Note that Result has no output PortConnectors, so the shape must be empty
SHAPE_INFER_PREDEFINED(ov::op::v0::Result, EmptyShapeInfer),
//
SHAPE_INFER_OP_SPECIFIC(op::LoadReshape),
SHAPE_INFER_OP_SPECIFIC(op::LoadReorder),
SHAPE_INFER_OP_SPECIFIC(op::Reshape),
SHAPE_INFER_OP_SPECIFIC(op::Reorder),
SHAPE_INFER_OP_SPECIFIC(op::RankNormalization),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -299,15 +299,15 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) {
*
* Param0(32,8,1)
* |
* LoadReshape with order (1,2,0)
* LoadReorder with order (1,2,0)
* |
* Store
* |
* Result
*/
{
auto param = linear_ir->push_node<ov::opset10::Parameter>(input_precision, input_shape_0);
auto load_reshape = linear_ir->push_node<ov::snippets::op::LoadReshape>(param.second, 1, 0, layout);
auto load_reshape = linear_ir->push_node<ov::snippets::op::LoadReorder>(param.second, 1, 0, layout);
auto store = linear_ir->push_node<ov::snippets::op::Store>(load_reshape.second, 1, 0);
init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout});
init_expr_descriptors(*store.first, {subtensor, subtensor}, {layout, layout});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/utils/utils.hpp"

#ifndef OPENVINO_ARCH_ARM64
#ifdef OPENVINO_ARCH_X86_64
# include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp"
# include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp"
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig {

class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
public:
CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {});
CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache);

/**
* @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ namespace intel_cpu {
struct RepackedInputKernel {
RepackedInputKernel() = default;
virtual ~RepackedInputKernel() = default;
virtual void operator()(const void* args) const = 0;
};

struct RepackedInput {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
jitters[snippets::op::Reorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter);

jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter);
jitters[snippets::op::LoadReshape::get_type_info_static()] =
jitters[snippets::op::LoadReorder::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter);
jitters[snippets::op::BroadcastLoad::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,11 @@ status_t BrgemmCopyBKernel::create_kernel() {
return code;
}

void BrgemmCopyBKernel::operator()(const call_args* args) const {
void BrgemmCopyBKernel::operator()(const void* args) const {
const auto* call_args = reinterpret_cast<const BrgemmCopyBKernel::call_args*>(args);
OV_CPU_JIT_EMITTER_ASSERT(call_args, "Call arguments are nullptr!");
OV_CPU_JIT_EMITTER_ASSERT(ker_, "Kernel is nullptr");
ker_(args);
ker_(call_args);
}

void BrgemmCopyBKernel::init_brgemm_copy_b_kernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x

dnnl::impl::status_t create_kernel() override;

void operator()(const call_args* args) const;
void operator()(const void* args) const override;

private:
void generate() override;
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ class TypeRelaxedExtension : public ov::OpExtension<ov::op::TypeRelaxed<Op>> {
OP_EXTENSION(ov::snippets::op::KernelStatic) \
OP_EXTENSION(ov::snippets::op::KernelDynamic) \
OP_EXTENSION(ov::snippets::op::Load) \
OP_EXTENSION(ov::snippets::op::LoadReshape) \
OP_EXTENSION(ov::snippets::op::LoadReorder) \
OP_EXTENSION(ov::snippets::op::LoopBegin) \
OP_EXTENSION(ov::snippets::op::LoopEnd) \
OP_EXTENSION(ov::snippets::op::Buffer) \
Expand Down
15 changes: 4 additions & 11 deletions src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@
//

#include "nodes/executors/subgraph.hpp"
#if defined(OPENVINO_ARCH_ARM64)
# include "emitters/snippets/aarch64/cpu_generator.hpp"
#else
# include "emitters/snippets/x64/cpu_generator.hpp"
#endif

#include "common/primitive_hashing_utils.hpp"
#include "openvino/core/parallel.hpp"

Expand Down Expand Up @@ -102,9 +98,8 @@ void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntime
std::vector<size_t>& domain) {
init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain);
}
void SubgraphBaseExecutor::parallel_for6d(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {

void SubgraphBaseExecutor::parallel_for6d(const initializer_functor& initializer, const call_functor& caller) {
const auto& dom = m_parallel_exec_domain;

parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
Expand Down Expand Up @@ -142,9 +137,7 @@ void SubgraphBaseExecutor::parallel_for6d(
});
}

void SubgraphBaseExecutor::parallel_forNd(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
void SubgraphBaseExecutor::parallel_forNd(const initializer_functor& initializer, const call_functor& caller) {
const auto& dom = m_parallel_exec_domain;

parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
Expand Down
14 changes: 5 additions & 9 deletions src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,17 @@ class SubgraphBaseExecutor {
protected:
virtual void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;

virtual void parallel_for6d(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
virtual void parallel_forNd(
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
using initializer_functor = std::function<void(jit_snippets_call_args&, size_t)>;
using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;

virtual void parallel_for6d(const initializer_functor& initializer, const call_functor& caller);
virtual void parallel_forNd(const initializer_functor& initializer, const call_functor& caller);

inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const {
if (m_buffer_scratchpad_size > 0)
scratchpad_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + ithr * m_buffer_scratchpad_size;
}

using initializer_functor = std::function<void(jit_snippets_call_args&, size_t)>;
using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;

std::shared_ptr<snippets::Schedule> m_schedule;
// Holds index of output used as in execution domain
// it should be compatible with a schedule's work size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,13 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() {
transformation_callback(copy_b_node))
return false;

// If there is non-empty and non-planar layout, we should insert reshape to support shape inference
// If there is a non-planar layout, we should insert a reshape to support shape inference
if (!ov::snippets::utils::is_planar_layout(layout)) {
const auto& subtensor = in_desc->get_subtensor();
const auto& reshape = std::make_shared<ov::snippets::op::Reorder>(copy_b_node->input_value(0), layout);
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout);
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor);
ov::replace_node_update_name(copy_b_node, reshape);
return true;
return ov::replace_node_update_name(copy_b_node, reshape);
}

// If there is no layout, we can just remove BrgemmCopyB from the subgraph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,12 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
update_kernel(p.second, shape, layout, N, K, prc);
}

const auto L2_cache_size = dnnl::utils::get_cache_size(2, true);
const auto fit_into_L2 = data_size < L2_cache_size;
// Heuristic: If external repacking data doesn't fit in the cache L2,
const auto cache_size = dnnl::utils::get_cache_size(1, true) + dnnl::utils::get_cache_size(2, true);
const auto fit_into_cache = data_size < cache_size;
// Heuristic: If external repacking data doesn't fit in the caches L1 and L2,
// external repacking should be executed in a separate parallel section before kernel execution.
cpu_config->repacking_impl_type =
fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL : CPURuntimeConfig::RepackingImplType::SEPARATE;
cpu_config->repacking_impl_type = fit_into_cache ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL
: CPURuntimeConfig::RepackingImplType::SEPARATE;

const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL;

Expand All @@ -131,7 +131,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
// Save original input offsets for input before repacking.
// If the shape has not been changed, it means that we already created `RepackedInput` for this input
// on previous pass call and now `cpu_config->io_data_offsets[i]` contains offsets not for original input -
// they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original
// they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original
// offsets.
const auto in_offsets =
shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i];
Expand Down
Loading

0 comments on commit f104183

Please sign in to comment.