ReduceDecomposition moved to backend-specific transformations
v-Golubev committed Jan 25, 2024
1 parent cd881a2 commit c09c7b8
Showing 8 changed files with 150 additions and 176 deletions.
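The hunks below implement the move: the registration is dropped from the common control-flow pipeline in Subgraph::control_flow_transformations and re-added in the CPU plugin's backend pass table, anchored after FuseLoops. The relevant registration lines, abbreviated from the diffs that follow (everything around them is unchanged):

// Before: common pipeline in src/common/snippets/src/op/subgraph.cpp
pipeline.register_pass<lowered::pass::MarkLoops>(vector_size);
pipeline.register_pass<lowered::pass::ReduceDecomposition>(vector_size);  // removed by this commit
pipeline.register_pass<lowered::pass::FuseLoops>();

// After: backend-specific registration in src/plugins/intel_cpu/src/nodes/subgraph.cpp
const size_t vector_size = snippetAttrs.snippet->get_generator()->get_target_machine()->get_lanes();
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::FuseLoops,
                                ov::intel_cpu::pass::ReduceDecomposition, vector_size);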
2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/pass/assign_registers.cpp
@@ -101,7 +101,7 @@ bool AssignRegisters::run(LinearIR& linear_ir) {
manually_assigned_gprs[expr->get_output_port_connector(0)] =
static_cast<Reg>(num_results + num_parameters + buffer_id);
} else if (ov::is_type<op::HorizonMax>(op) || ov::is_type<op::HorizonSum>(op)) {
// Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer.
// Only decomposed Reduce ops use HorizonMax/HorizonSum and VectorBuffer.
// We should manually assign the same vector register to VectorBuffer and the Max/Sum output to simulate an accumulator
// TODO [96351]: We should rewrite accumulator pattern using another way
const auto& input_tensor = expr->get_input_port_connector(0);
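For readers unfamiliar with the pattern, here is a standalone sketch of the accumulator idea that comment describes, in plain C++ rather than the snippets IR (kLanes and the helper are made up for the illustration; kLanes stands in for the target machine's vector width, get_lanes() in the plugin code): a vector-wide accumulator is filled with the identity value, updated element-wise inside the reduce loop, and collapsed by a final horizontal ("horizon") reduction, which is why VectorBuffer and the HorizonMax/HorizonSum output can live in a single vector register.

#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

constexpr std::size_t kLanes = 8;  // stands in for the vector register width

float reduce_max(const std::vector<float>& data) {
    // VectorBuffer + initial Fill: one vector-sized accumulator holding the identity element
    std::array<float, kLanes> acc;
    acc.fill(std::numeric_limits<float>::lowest());  // same value as the 0xff7fffff fill constant in the pass

    // Reduce loop: element-wise Maximum into the same accumulator
    std::size_t i = 0;
    for (; i + kLanes <= data.size(); i += kLanes)
        for (std::size_t lane = 0; lane < kLanes; ++lane)
            acc[lane] = std::max(acc[lane], data[i + lane]);
    for (std::size_t lane = 0; i + lane < data.size(); ++lane)  // tail iteration
        acc[lane] = std::max(acc[lane], data[i + lane]);

    // HorizonMax: collapse the accumulator lanes into a scalar
    return *std::max_element(acc.begin(), acc.end());
}

int main() {
    const std::vector<float> data{3.f, -1.f, 7.f, 2.f, 5.f, -4.f, 0.f, 9.f, 6.f, 8.f};
    std::cout << reduce_max(data) << "\n";  // prints 9
}

The decomposed IR built further down in this commit has the same shape: VectorBuffer and Fill before the loop, Maximum/Add inside it, HorizonMax/HorizonSum after it.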
2 changes: 0 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
@@ -43,7 +43,6 @@
#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "snippets/lowered/pass/validate_shapes.hpp"
#include "snippets/lowered/pass/pass_config.hpp"
#include "snippets/lowered/pass/reduce_decomposition.hpp"

#include "transformations/utils/utils.hpp"

@@ -424,7 +423,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,

lowered::pass::PassPipeline pipeline(lowered_pass_config);
pipeline.register_pass<lowered::pass::MarkLoops>(vector_size);
pipeline.register_pass<lowered::pass::ReduceDecomposition>(vector_size);
pipeline.register_pass<lowered::pass::FuseLoops>();
pipeline.register_pass<lowered::pass::SplitLoops>();
pipeline.register_pass<lowered::pass::MoveResultOutOfLoop>();
@@ -53,13 +53,6 @@ class EltwiseBufferAllocationTest : public BufferAllocationTest {
std::shared_ptr<ov::Model> GetModel() const override;
};

class MHABufferAllocationTest : public BufferAllocationTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override;

static void MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor);
};

} // namespace snippets
} // namespace test
} // namespace ov
90 changes: 0 additions & 90 deletions src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp
@@ -16,7 +16,6 @@
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/reduce_decomposition.hpp"

#include "common_test_utils/common_utils.hpp"

@@ -69,7 +68,6 @@ void BufferAllocationTest::MarkOp(const std::shared_ptr<ov::Node>& node, const s
void BufferAllocationTest::ApplyTransformations(const std::shared_ptr<ov::snippets::lowered::pass::PassConfig>& pass_config) {
ov::snippets::lowered::pass::PassPipeline pipeline(pass_config);
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::ReduceDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
@@ -114,65 +112,9 @@ std::shared_ptr<ov::Model> EltwiseBufferAllocationTest::GetModel() const {
return body;
}

void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor) {
const auto subtensor_full = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(0), subtensor));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(1), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(1), subtensor_full));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->output(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->output(0), subtensor));
}

std::shared_ptr<ov::Model> MHABufferAllocationTest::GetModel() const {
const auto subtensor_scalar = std::vector<size_t>{1};
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_brgemm = std::vector<size_t>{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};

const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 128, 12, 64}));
const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));

const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
const auto relu0 = std::make_shared<ov::op::v0::Relu>(store);
const auto matmul0 = std::make_shared<ov::snippets::op::Brgemm>(parameter0, relu0);
const auto relu1 = std::make_shared<ov::op::v0::Relu>(matmul0);

// Decomposed Softmax
const auto reduce_max = std::make_shared<ov::snippets::op::ReduceMax>(relu1, 3);
const auto subtract = std::make_shared<ov::op::v1::Subtract>(relu1, reduce_max);
const auto exp = std::make_shared<ov::op::v0::Exp>(subtract);

const auto reduce_sum = std::make_shared<ov::snippets::op::ReduceSum>(exp, 3);
const auto power = std::make_shared<ov::snippets::op::PowerStatic>(reduce_sum, -1.f);
const auto multiply = std::make_shared<ov::op::v1::Multiply>(exp, power);

const auto matmul1 = std::make_shared<ov::snippets::op::Brgemm>(multiply, parameter2);
const auto relu2 = std::make_shared<ov::op::v0::Relu>(matmul1);

const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});

MarkOp(load_reshape, subtensor_scalar);
MarkOp(store, subtensor_scalar);
MarkOp(reduce_max, subtensor_softmax);
MarkOp(reduce_sum, subtensor_softmax);
MarkOp(power, subtensor_softmax);

MarkBrgemm(matmul0, subtensor_brgemm);
MarkBrgemm(matmul1, subtensor_brgemm);

return body;
}

TEST_P(EltwiseBufferAllocationTest, BufferAllocation) {
Validate();
}
TEST_P(MHABufferAllocationTest, BufferAllocation) {
Validate();
}

namespace BufferAllocationTest_Instances {

@@ -192,38 +134,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, Eltwi
::testing::Values(1)), // Two Buffers reuse IDs
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(true),
::testing::Values(139264), // Each Buffer has own allocated memory
::testing::Values(7)), // Each Buffer has unique ID
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(true),
::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false),
::testing::Values(360448), // Each Buffer has own allocated memory
::testing::Values(7)), // Each Buffer has unique ID
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false),
::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);

} // namespace BufferAllocationTest_Instances
} // namespace snippets
} // namespace test
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -19,11 +19,13 @@
#include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "transformations/defs.hpp"
#include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp"
#include "transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp"
#include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"
#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp"
#include "transformations/snippets/x64/pass/lowered/reduce_decomposition.hpp"
#include "transformations/snippets/x64/pass/mul_add_to_fma.hpp"
#include "transformations/snippets/x64/pass/remove_converts.hpp"
#include "transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.hpp"
@@ -353,6 +355,7 @@ void Snippet::initOptimalPrimitiveDescriptor() {
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::pass::MatMulToBrgemm,
pass::EnforcePrecision, element::f32, element::bf16);
}

SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::snippets::pass::PropagatePrecision,
ov::intel_cpu::pass::BrgemmToBrgemmCPU);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU,
@@ -633,6 +636,9 @@ void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp)
# define SNIPPETS_REGISTER_PASS_RELATIVE(PASS_PLACE, TARGET_PASS, PASS, ...)
#endif // OPENVINO_ARCH_X86_64

const size_t vector_size = snippetAttrs.snippet->get_generator()->get_target_machine()->get_lanes();
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::FuseLoops,
ov::intel_cpu::pass::ReduceDecomposition, vector_size);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::MarkLoops,
ov::intel_cpu::pass::BrgemmBlocking);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::InsertLoops,
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/reduce_decomposition.hpp"
#include "reduce_decomposition.hpp"

#include "snippets/itt.hpp"
#include "snippets/lowered/linear_ir.hpp"
@@ -11,15 +11,19 @@
#include "snippets/snippets_isa.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace intel_cpu {
namespace pass {

using LinearIR = snippets::lowered::LinearIR;
using HandlerType = LinearIR::LoopManager::LoopInfo::SpecificIterationHandlers::HandlerType;
using namespace ov::snippets::lowered;


namespace {
uint32_t get_initial_value(const ov::DiscreteTypeInfo& type_info) {
static const std::map<ov::DiscreteTypeInfo, uint32_t> reduce_initial_values {
{op::ReduceMax::get_type_info_static(), uint32_t(0xff7fffff)},
{op::ReduceSum::get_type_info_static(), uint32_t(0x00000000)},
{ov::snippets::op::ReduceMax::get_type_info_static(), uint32_t(0xff7fffff)},
{ov::snippets::op::ReduceSum::get_type_info_static(), uint32_t(0x00000000)},
};
OPENVINO_ASSERT(reduce_initial_values.count(type_info), "Unexpected ReduceType");
return reduce_initial_values.at(type_info);
@@ -28,36 +32,33 @@ uint32_t get_initial_value(const ov::DiscreteTypeInfo& type_info) {
std::shared_ptr<ov::Node> get_accumulation_node(const ov::Output<ov::Node>& input0,
const ov::Output<ov::Node>& input1,
const ov::DiscreteTypeInfo& type_info) {
if (type_info == op::ReduceMax::get_type_info_static()) {
if (type_info == ov::snippets::op::ReduceMax::get_type_info_static()) {
return std::make_shared<ov::op::v1::Maximum>(input0, input1);
} else if (type_info == op::ReduceSum::get_type_info_static()) {
} else if (type_info == ov::snippets::op::ReduceSum::get_type_info_static()) {
return std::make_shared<ov::op::v1::Add>(input0, input1);
} else {
OPENVINO_THROW("Unsupported reduce type: ", type_info);
}
}

std::shared_ptr<ov::Node> get_horizon_node(const ov::Output<ov::Node>& input, const ov::DiscreteTypeInfo& type_info) {
if (type_info == op::ReduceMax::get_type_info_static()) {
return std::make_shared<op::HorizonMax>(input);
} else if (type_info == op::ReduceSum::get_type_info_static()) {
return std::make_shared<op::HorizonSum>(input);
if (type_info == ov::snippets::op::ReduceMax::get_type_info_static()) {
return std::make_shared<ov::snippets::op::HorizonMax>(input);
} else if (type_info == ov::snippets::op::ReduceSum::get_type_info_static()) {
return std::make_shared<ov::snippets::op::HorizonSum>(input);
} else {
OPENVINO_THROW("Unsupported reduce type: ", type_info);
}
}
} // namespace

using LoopInfo = LinearIR::LoopManager::LoopInfo;
using HandlerType = LoopInfo::SpecificIterationHandlers::HandlerType;

ReduceDecomposition::ReduceDecomposition(size_t vector_size) : m_vector_size{vector_size} {}

bool ReduceDecomposition::run(LinearIR& linear_ir) {
bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ReduceMaxDecompositionLowered")
const auto& loop_manager = linear_ir.get_loop_manager();
bool modified = false;
for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
for (auto expr_it = begin; expr_it != end; expr_it++) {
const auto& reduce_expr = *expr_it;
const auto& reduce = ov::as_type_ptr<ov::snippets::op::ReduceBase>(reduce_expr->get_node());
if (!reduce)
Expand All @@ -81,11 +82,11 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
const auto fill_value = get_initial_value(reduce_type_info);
// Note: VectorBuffer is a special case, since it should go before the initial Load.
// The buffer must be initialized with fill_value before reduction
const auto vector_buffer = push_node(std::make_shared<op::VectorBuffer>());
const auto initial_fill = push_node(std::make_shared<op::Fill>(vector_buffer.second, 0, fill_value));
const auto vector_buffer = push_node(std::make_shared<ov::snippets::op::VectorBuffer>());
const auto initial_fill = push_node(std::make_shared<ov::snippets::op::Fill>(vector_buffer.second, 0, fill_value));

// Reduce loop
const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), increment, fill_value));
const auto fill = push_node(std::make_shared<ov::snippets::op::Fill>(reduce->get_input_source_output(0), increment, fill_value));
const auto accumulation = push_node(get_accumulation_node(fill.second, initial_fill.second, reduce_type_info));

const auto reduce_loop_id = loop_manager->mark_loop(
Expand All @@ -98,7 +99,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
std::vector<ExpressionPort>{(*accumulation.first)->get_output_port(0)});
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
loop_manager->get_loop_info(reduce_loop_id)->register_handler<HandlerType::LAST_ITER, SetFillOffset>(tail_size);
loop_manager->get_loop_info(reduce_loop_id)->register_handler<HandlerType::LAST_ITER, ov::snippets::lowered::pass::SetFillOffset>(tail_size);
}
const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));

@@ -124,7 +125,6 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
return modified;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
} // namespace pass
} // namespace intel_cpu
} // namespace ov
@@ -4,11 +4,10 @@

#pragma once

#include "pass.hpp"
#include "snippets/lowered/pass/pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace intel_cpu {
namespace pass {

/**
@@ -17,17 +16,18 @@ namespace pass {
* @attention Only Reduce by last dimension is supported
* @ingroup snippets
*/
class ReduceDecomposition : public Pass {
class ReduceDecomposition : public snippets::lowered::pass::RangedPass {
public:
OPENVINO_RTTI("ReduceDecomposition", "Pass")
explicit ReduceDecomposition(size_t vector_size);
bool run(LinearIR& linear_ir) override;
bool run(snippets::lowered::LinearIR& linear_ir,
snippets::lowered::LinearIR::constExprIt begin,
snippets::lowered::LinearIR::constExprIt end) override;

private:
size_t m_vector_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
} // namespace pass
} // namespace intel_cpu
} // namespace ov
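Beyond the namespace move (ov::snippets::lowered::pass to ov::intel_cpu::pass), the two hunks above also change the pass interface: the old common pass derived from Pass and always walked the whole LinearIR, while the CPU version derives from snippets::lowered::pass::RangedPass and only visits the [begin, end) range supplied by the caller. The essential difference, abbreviated from the hunks above:

// Before: snippets::lowered::pass::Pass, iterates the entire LinearIR
bool ReduceDecomposition::run(LinearIR& linear_ir) {
    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { /* ... */ }
}

// After: ov::intel_cpu::pass::ReduceDecomposition derives from snippets::lowered::pass::RangedPass
// and iterates only the requested range
bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
    for (auto expr_it = begin; expr_it != end; expr_it++) { /* ... */ }
}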