Skip to content

Commit

Permalink
Changes for MatMul
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Nov 30, 2022
1 parent 60d3ea5 commit b6fe09b
Show file tree
Hide file tree
Showing 20 changed files with 540 additions and 107 deletions.
15 changes: 13 additions & 2 deletions src/common/snippets/include/snippets/op/matmul_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace op {
class MatMulCPU : public ngraph::op::v0::MatMul {
public:
OPENVINO_OP("MatMulCPU", "SnippetsOpset", ngraph::op::v0::MatMul);
MatMulCPU(const Output<Node>& A, const Output<Node>& B);
MatMulCPU(const Output<Node>& A, const Output<Node>& B, size_t offset_a = 0, size_t offset_b = 0, size_t offset_c = 0);
MatMulCPU() = default;

bool visit_attributes(AttributeVisitor& visitor) override;
Expand All @@ -29,9 +29,20 @@ class MatMulCPU : public ngraph::op::v0::MatMul {

bool has_evaluate() const override { return false; }

size_t get_offset_a() const { return m_offset_a; }
size_t get_offset_b() const { return m_offset_b; }
size_t get_offset_c() const { return m_offset_c; }

void set_offset_a(size_t offset) { m_offset_a = offset; }
void set_offset_b(size_t offset) { m_offset_b = offset; }
void set_offset_c(size_t offset) { m_offset_c = offset; }

private:
MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout);
MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout, size_t offset_a = 0, size_t offset_b = 0, size_t offset_c = 0);
std::vector<size_t> m_output_layout;
size_t m_offset_a = 0lu;
size_t m_offset_b = 0lu;
size_t m_offset_c = 0lu;
};

} // namespace op
Expand Down
10 changes: 10 additions & 0 deletions src/common/snippets/src/op/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ void snippets::op::Buffer::set_offset(const size_t offset) {
}
if (auto store = std::dynamic_pointer_cast<snippets::op::Store>(parent)) {
store->set_offset(m_offset);
} else if (auto matmul = std::dynamic_pointer_cast<snippets::op::MatMulCPU>(parent)) {
// MatMul encapsulates work with Loops inside itself
matmul->set_offset_c(m_offset);
} else {
throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation");
}
Expand All @@ -73,6 +76,13 @@ void snippets::op::Buffer::set_offset(const size_t offset) {
}
} else if (const auto load = std::dynamic_pointer_cast<snippets::op::Load>(child)) {
load->set_offset(m_offset);
} else if (auto matmul = std::dynamic_pointer_cast<snippets::op::MatMulCPU>(child)) {
// MatMul encapsulates work with Loops inside itself
if (target_input.get_index() == 0) {
matmul->set_offset_a(m_offset);
} else {
matmul->set_offset_b(m_offset);
}
} else {
throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset propagation");
}
Expand Down
16 changes: 8 additions & 8 deletions src/common/snippets/src/op/matmul_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,25 @@ namespace ngraph {
namespace snippets {
namespace op {

MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B) : MatMul(), m_output_layout({}) {
MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B, size_t offset_a, size_t offset_b, size_t offset_c)
: MatMul(), m_output_layout({}), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) {
set_arguments({A, B});
set_output_size(1);
constructor_validate_and_infer_types();
}

MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout)
: MatMul(), m_output_layout(std::move(output_layout)) {
MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout, size_t offset_a, size_t offset_b, size_t offset_c)
: MatMul(), m_output_layout(std::move(output_layout)), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) {
set_arguments({A, B});
set_output_size(1);
constructor_validate_and_infer_types();
}

bool MatMulCPU::visit_attributes(AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(MatMulCPU_visit_attributes);
visitor.on_attribute("offset_a", m_offset_a);
visitor.on_attribute("offset_b", m_offset_b);
visitor.on_attribute("offset_c", m_offset_c);
// todo: should we visit planar shapes?
//visitor.on_attribute("leading_dimensions", m_leading_dimensions);
return true;
Expand Down Expand Up @@ -81,11 +85,7 @@ void MatMulCPU::validate_and_infer_types() {
std::shared_ptr<Node> MatMulCPU::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(MatMulCPU_clone_with_new_inputs);
check_new_args_count(this, new_args);
// auto new_matmul = std::make_shared<MatMulCPU>(new_args.at(0), new_args.at(1));
return std::shared_ptr<Node>(new MatMulCPU(new_args.at(0), new_args.at(1), m_output_layout));
// new_matmul->output_layout = output_layout;
// return new_matmul;
// return std::make_shared<MatMulCPU>(new_args.at(0), new_args.at(1));
return std::shared_ptr<Node>(new MatMulCPU(new_args.at(0), new_args.at(1), m_output_layout, m_offset_a, m_offset_b, m_offset_c));
}

} // namespace op
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ FuseTransposeMatMulCPU::FuseTransposeMatMulCPU() {
auto matmul_out0 = pattern::wrap_type<opset1::Transpose>({matmul_any, constant});
auto matmul_or_transpose = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{matmul_in0, matmul_in1, matmul_out0});

auto callback = [](pattern::Matcher& m) {
auto callback = [&transpose_is_supported](pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseTransposeMatMulCPU")
auto set_layout_from_order = [](const std::shared_ptr<opset1::Transpose>& node, const ov::Output<Node>& port) {
const auto& const_order = as_type_ptr<opset1::Constant>(node->get_input_node_shared_ptr(1));
Expand All @@ -72,8 +72,10 @@ FuseTransposeMatMulCPU::FuseTransposeMatMulCPU() {
for (int i = 0; i < matmul->get_input_size(); i++) {
const auto& in_value = matmul->input_value(i);
if (const auto& transpose = as_type_ptr<opset1::Transpose>(in_value.get_node_shared_ptr())) {
set_layout_from_order(transpose, transpose->input_value(0));
matmul->set_argument(i, transpose->input_value(0));
if (transpose_is_supported(transpose)) {
set_layout_from_order(transpose, transpose->input_value(0));
matmul->set_argument(i, transpose->input_value(0));
}
}
}
// need to run validate_and_infer_types manually: either input shapes were updated or
Expand Down
10 changes: 7 additions & 3 deletions src/common/snippets/src/pass/insert_loops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ std::vector<int64_t> InsertLoops::calculate_finalization_offsets(const ov::Parti
return inner_finalization_offsets;
}

void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
size_t inner_work_amount, size_t outer_work_amount, size_t vector_size) {
void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape, size_t vector_size) {
ov::NodeVector body;
ov::OutputVector body_parameters;
std::vector<ov::Input<ov::Node>> body_results;
Expand All @@ -67,6 +66,11 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape&

auto apply_increments = InsertLoops::calculate_inner_apply_increments(master_shape, body_shapes);
std::vector<int64_t> inner_finalization_offsets(body_shapes.size(), 0);
auto body_master_shape = body_shapes.front();
for (const auto& shape : body_shapes)
PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY);
const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length();
const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length();
if (outer_work_amount > 1) {
inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(master_shape, body_shapes);
}
Expand Down Expand Up @@ -236,7 +240,7 @@ bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
op::insertLoopEnd(commonResults, outer_loop_begin, 1lu, outer_work_amount, 1lu, apply_increments);
}
} else {
insert_explicitly_loops(ops, m_master_shape, inner_work_amount, outer_work_amount, m_vector_size);
insert_explicitly_loops(ops, m_master_shape, m_vector_size);
}
}

Expand Down
40 changes: 23 additions & 17 deletions src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1024,6 +1024,10 @@ MatMulEmitter::MatMulEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl:
}
}
}

load_offset_a = matmul_node->get_offset_a();
load_offset_b = matmul_node->get_offset_b();
store_offset_c = matmul_node->get_offset_c();
}

void MatMulEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr<brgemm_kernel_t>& brgKernel, bool use_amx) const {
Expand Down Expand Up @@ -1070,7 +1074,8 @@ void MatMulEmitter::emit_impl(const std::vector<size_t>& in,
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void MatMulEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, int bs,
Reg64 addr_A, Reg64 addr_B,
const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const {
const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch,
const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
size_t gpr_size = 8;
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
Expand Down Expand Up @@ -1120,8 +1125,15 @@ void MatMulEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in
// todo: Windows ABI : requires different num of arguments passed in regs and on the stack. Need to align.
h->mov(abi_param1, reinterpret_cast<uintptr_t>(brgKernel));
h->mov(abi_param2, bs);
h->uni_vmovq(abi_param3, Xmm(0));
h->uni_vmovq(abi_param4, Xmm(1));

const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t memory_bytes_offset, size_t kernel_bytes_offset) {
h->uni_vmovq(reg, xmm);
if (memory_bytes_offset) h->add(reg, memory_bytes_offset);
if (kernel_bytes_offset) h->add(reg, kernel_bytes_offset);
};
data_ptr(Xmm(0), abi_param3, load_offset_a, in0_kernel_offset);
data_ptr(Xmm(1), abi_param4, load_offset_b, in1_kernel_offset);

size_t num_args_passed_on_stack = 1;
#ifdef _WIN32
num_args_passed_on_stack = 3;
Expand All @@ -1130,9 +1142,11 @@ void MatMulEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in
h->mov(h->qword[h->rsp], reinterpret_cast<uint64_t>(scratch));
h->mov(h->qword[h->rsp + gpr_size], reinterpret_cast<uintptr_t>(batch));
h->mov(h->qword[h->rsp + 2 * gpr_size], Xmm(2));
if (store_offset_c) h->add(h->qword[h->rsp + 2 * gpr_size], store_offset_c);
if (out0_kernel_offset) h->add(h->qword[h->rsp + 2 * gpr_size], out0_kernel_offset);
#else
h->mov(abi_param5, reinterpret_cast<uintptr_t>(batch));
h->uni_vmovq(abi_param6, Xmm(2));
data_ptr(Xmm(2), abi_param6, store_offset_c, out0_kernel_offset);
h->sub(h->rsp, gpr_size);
h->mov(h->qword[h->rsp], reinterpret_cast<uint64_t>(scratch));
#endif
Expand Down Expand Up @@ -1194,25 +1208,17 @@ void MatMulEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<si
const size_t in0_offset = (k * K0_step0 + mb * M_blk * brgemmCtx.LDA) * io_data_size[0];
const size_t in1_offset = (k * K0_step1 + n * N0_step0) * io_data_size[1];
const size_t out0_offset = (n * N0_step1 + mb * M_blk * brgemmCtx.LDC) * io_data_size[2];
if (in0_offset != 0)
h->add(input_0, in0_offset);
if (in1_offset != 0)
h->add(input_1, in1_offset);
if (out0_offset != 0)
h->add(output_0, out0_offset);

emit_brgemm_kernel_call<isa>(brgKernels0[getBrgIdx(mIdx, k, n)].get(),
1,
input_0,
input_1,
nullptr,
output_0,
nullptr);
if (in0_offset != 0)
h->sub(input_0, in0_offset);
if (in1_offset != 0)
h->sub(input_1, in1_offset);
if (out0_offset != 0)
h->sub(output_0, out0_offset);
nullptr,
in0_offset,
in1_offset,
out0_offset);
}
}
}
Expand Down
7 changes: 6 additions & 1 deletion src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,8 @@ class MatMulEmitter : public jit_emitter {
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, int bs,
Reg64 addr_A, Reg64 addr_B,
const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const;
const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch,
const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const;

static constexpr size_t MHA_BRGEMM_KERNELS_NUM = 8;
static constexpr size_t matmulOptimalM = 32;
Expand All @@ -490,6 +491,10 @@ class MatMulEmitter : public jit_emitter {
size_t M, M_blk, M_tail;
size_t K0, K0_blk, K0_tail, N0, N0_blk, N0_tail;
size_t brg0VnniFactor;

size_t load_offset_a = 0lu;
size_t load_offset_b = 0lu;
size_t store_offset_c = 0lu;
};

} // namespace intel_cpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ std::vector<std::vector<ov::PartialShape>> input_shapes{
{{1, 2, 69, 43}, {2, 1, 43, 49}}
};
std::vector<element::Type> precisions{element::f32};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMul, MatMul,
::testing::Combine(
::testing::ValuesIn(input_shapes),
::testing::ValuesIn(precisions),
Expand All @@ -32,6 +32,42 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
::testing::Values(CommonTestUtils::DEVICE_CPU)),
MatMul::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias,
::testing::Combine(
::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 1, 43, 49}, {1, 1, 69, 49}}),
::testing::ValuesIn(precisions),
::testing::Values(4), // Sinh * 3 + Subgraph;
::testing::Values(1), // Tokenized MatMul+Bias
::testing::Values(CommonTestUtils::DEVICE_CPU)),
MatMul::getTestCaseName);

// Smoke instantiation of ExplicitTransposeMatMul on the CPU device for a single pair of input shapes.
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul,
::testing::Combine(
::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}}),
::testing::ValuesIn(precisions),
::testing::Values(3), // Sinh * 2 + Subgraph;
::testing::Values(1), // Tokenized Transpose+MatMul (this case has only two inputs, no Bias) - TODO confirm
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ExplicitTransposeMatMul::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias,
::testing::Combine(
::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}}),
::testing::ValuesIn(precisions),
::testing::Values(4), // Sinh * 3 + Subgraph;
::testing::Values(1), // Tokenized MatMul+Bias
::testing::Values(CommonTestUtils::DEVICE_CPU)),
MatMul::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMulMatMulBias, ExplicitTransposeMulMatMulBias,
::testing::Combine(
::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 2, 1, 1}, {1, 1, 69, 49}}),
::testing::ValuesIn(precisions),
::testing::Values(5), // Sinh * 4 + Subgraph;
::testing::Values(1), // Tokenized MatMul+Bias
::testing::Values(CommonTestUtils::DEVICE_CPU)),
MatMul::getTestCaseName);

namespace transpose_zero_input {
std::vector<std::vector<ov::PartialShape>> transpose_input_shapes{
{{2, 69, 3, 43}, {2, 3, 43, 49}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,17 @@ namespace snippets {

namespace {

const std::vector<ov::Shape> inputShape = {
ov::Shape{1, 128, 3, 16},
const std::vector<ov::Shape> inputShapes = {
{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64},
};

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmax, TransposeSoftmax,
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHA,
::testing::Combine(
::testing::Values(inputShape),
::testing::Values(std::vector<int64_t>{0, 2, 3, 1}),
::testing::Values(-1),
::testing::Values(2), // Subgraph + Sin
::testing::Values(inputShapes),
::testing::Values(5), // Subgraph + 4xSin
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
TransposeSoftmax::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmaxEltwise, TransposeSoftmaxEltwise,
::testing::Combine(
::testing::Values(inputShape),
::testing::Values(std::vector<int64_t>{0, 2, 3, 1}),
::testing::Values(-1),
::testing::Values(2), // Subgraph + Sin
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
TransposeSoftmax::getTestCaseName);
MHA::getTestCaseName);

} // namespace
} // namespace snippets
Expand Down
Loading

0 comments on commit b6fe09b

Please sign in to comment.