Changes for MatMul

openvinotoolkit · Nov 30, 2022 · b6fe09b · b6fe09b
1 parent 60d3ea5
commit b6fe09b
Show file tree

Hide file tree

Showing 20 changed files with 540 additions and 107 deletions.
diff --git a/src/common/snippets/include/snippets/op/matmul_cpu.hpp b/src/common/snippets/include/snippets/op/matmul_cpu.hpp
@@ -20,7 +20,7 @@ namespace op {
 class MatMulCPU : public ngraph::op::v0::MatMul {
 public:
     OPENVINO_OP("MatMulCPU", "SnippetsOpset", ngraph::op::v0::MatMul);
-    MatMulCPU(const Output<Node>& A, const Output<Node>& B);
+    MatMulCPU(const Output<Node>& A, const Output<Node>& B, size_t offset_a = 0, size_t offset_b = 0, size_t offset_c = 0);
     MatMulCPU() = default;
 
     bool visit_attributes(AttributeVisitor& visitor) override;
@@ -29,9 +29,20 @@ class MatMulCPU : public ngraph::op::v0::MatMul {
 
     bool has_evaluate() const override { return false; }
 
+    size_t get_offset_a() const { return m_offset_a; }
+    size_t get_offset_b() const { return m_offset_b; }
+    size_t get_offset_c() const { return m_offset_c; }
+
+    void set_offset_a(size_t offset) { m_offset_a = offset; }
+    void set_offset_b(size_t offset) { m_offset_b = offset; }
+    void set_offset_c(size_t offset) { m_offset_c = offset; }
+
 private:
-    MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout);
+    MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout, size_t offset_a = 0, size_t offset_b = 0, size_t offset_c = 0);
     std::vector<size_t> m_output_layout;
+    size_t m_offset_a = 0lu;
+    size_t m_offset_b = 0lu;
+    size_t m_offset_c = 0lu;
 };
 
 } // namespace op

diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp
@@ -56,6 +56,9 @@ void snippets::op::Buffer::set_offset(const size_t offset) {
         }
         if (auto store = std::dynamic_pointer_cast<snippets::op::Store>(parent)) {
             store->set_offset(m_offset);
+        } else if (auto matmul = std::dynamic_pointer_cast<snippets::op::MatMulCPU>(parent)) {
+            // MatMul encapsulates work with Loops inside itself
+            matmul->set_offset_c(m_offset);
         } else {
             throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation");
         }
@@ -73,6 +76,13 @@ void snippets::op::Buffer::set_offset(const size_t offset) {
                 }
             } else if (const auto load = std::dynamic_pointer_cast<snippets::op::Load>(child)) {
                 load->set_offset(m_offset);
+            } else if (auto matmul = std::dynamic_pointer_cast<snippets::op::MatMulCPU>(child)) {
+                // MatMul encapsulates work with Loops inside itself
+                if (target_input.get_index() == 0) {
+                    matmul->set_offset_a(m_offset);
+                } else {
+                    matmul->set_offset_b(m_offset);
+                }
             } else {
                 throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset propagation");
             }

diff --git a/src/common/snippets/src/op/matmul_cpu.cpp b/src/common/snippets/src/op/matmul_cpu.cpp
@@ -12,21 +12,25 @@ namespace ngraph {
 namespace snippets {
 namespace op {
 
-MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B) : MatMul(), m_output_layout({}) {
+MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B, size_t offset_a, size_t offset_b, size_t offset_c)
+    : MatMul(), m_output_layout({}), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) {
     set_arguments({A, B});
     set_output_size(1);
     constructor_validate_and_infer_types();
 }
 
-MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout)
-    : MatMul(), m_output_layout(std::move(output_layout)) {
+MatMulCPU::MatMulCPU(const Output<Node>& A, const Output<Node>& B, std::vector<size_t> output_layout, size_t offset_a, size_t offset_b, size_t offset_c)
+    : MatMul(), m_output_layout(std::move(output_layout)), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) {
     set_arguments({A, B});
     set_output_size(1);
     constructor_validate_and_infer_types();
 }
 
 bool MatMulCPU::visit_attributes(AttributeVisitor& visitor) {
     INTERNAL_OP_SCOPE(MatMulCPU_visit_attributes);
+    visitor.on_attribute("offset_a", m_offset_a);
+    visitor.on_attribute("offset_b", m_offset_b);
+    visitor.on_attribute("offset_c", m_offset_c);
     // todo: should we visit planar shapes?
     //visitor.on_attribute("leading_dimensions", m_leading_dimensions);
     return true;
@@ -81,11 +85,7 @@ void MatMulCPU::validate_and_infer_types() {
 std::shared_ptr<Node> MatMulCPU::clone_with_new_inputs(const OutputVector& new_args) const {
     INTERNAL_OP_SCOPE(MatMulCPU_clone_with_new_inputs);
     check_new_args_count(this, new_args);
-//    auto new_matmul = std::make_shared<MatMulCPU>(new_args.at(0), new_args.at(1));
-    return std::shared_ptr<Node>(new MatMulCPU(new_args.at(0), new_args.at(1), m_output_layout));
-//    new_matmul->output_layout = output_layout;
-//    return new_matmul;
-//    return std::make_shared<MatMulCPU>(new_args.at(0), new_args.at(1));
+    return std::shared_ptr<Node>(new MatMulCPU(new_args.at(0), new_args.at(1), m_output_layout, m_offset_a, m_offset_b, m_offset_c));
 }
 
 } // namespace op

diff --git a/src/common/snippets/src/pass/fuse_transpose_and_matmul_cpu.cpp b/src/common/snippets/src/pass/fuse_transpose_and_matmul_cpu.cpp
@@ -49,7 +49,7 @@ FuseTransposeMatMulCPU::FuseTransposeMatMulCPU() {
     auto matmul_out0 = pattern::wrap_type<opset1::Transpose>({matmul_any, constant});
     auto matmul_or_transpose = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{matmul_in0, matmul_in1, matmul_out0});
 
-    auto callback = [](pattern::Matcher& m) {
+    auto callback = [transpose_is_supported](pattern::Matcher& m) {  // by-value capture: a reference to a constructor local would dangle if this runs after the constructor returns
         OV_ITT_SCOPED_TASK(pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseTransposeMatMulCPU")
         auto set_layout_from_order = [](const std::shared_ptr<opset1::Transpose>& node, const ov::Output<Node>& port) {
             const auto& const_order = as_type_ptr<opset1::Constant>(node->get_input_node_shared_ptr(1));
@@ -72,8 +72,10 @@ FuseTransposeMatMulCPU::FuseTransposeMatMulCPU() {
         for (int i = 0; i < matmul->get_input_size(); i++) {
             const auto& in_value = matmul->input_value(i);
             if (const auto& transpose = as_type_ptr<opset1::Transpose>(in_value.get_node_shared_ptr())) {
-                set_layout_from_order(transpose, transpose->input_value(0));
-                matmul->set_argument(i, transpose->input_value(0));
+                if (transpose_is_supported(transpose)) {
+                    set_layout_from_order(transpose, transpose->input_value(0));
+                    matmul->set_argument(i, transpose->input_value(0));
+                }
             }
         }
         // need to run validate_and_infer_types manually: either input shapes were updated or

diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp
@@ -48,8 +48,7 @@ std::vector<int64_t> InsertLoops::calculate_finalization_offsets(const ov::Parti
     return inner_finalization_offsets;
 }
 
-void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
-                             size_t inner_work_amount, size_t outer_work_amount, size_t vector_size) {
+void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape, size_t vector_size) {
     ov::NodeVector body;
     ov::OutputVector body_parameters;
     std::vector<ov::Input<ov::Node>> body_results;
@@ -67,6 +66,11 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape&
 
         auto apply_increments = InsertLoops::calculate_inner_apply_increments(master_shape, body_shapes);
         std::vector<int64_t> inner_finalization_offsets(body_shapes.size(), 0);
+        auto body_master_shape = body_shapes.front();
+        for (const auto& shape : body_shapes)
+            PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY);
+        const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length();
+        const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length();
         if (outer_work_amount > 1) {
             inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(master_shape, body_shapes);
         }
@@ -236,7 +240,7 @@ bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
                 op::insertLoopEnd(commonResults, outer_loop_begin, 1lu, outer_work_amount, 1lu, apply_increments);
             }
         } else {
-            insert_explicitly_loops(ops, m_master_shape, inner_work_amount, outer_work_amount, m_vector_size);
+            insert_explicitly_loops(ops, m_master_shape, m_vector_size);
         }
     }
 

diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
@@ -1024,6 +1024,10 @@ MatMulEmitter::MatMulEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl:
             }
         }
     }
+
+    load_offset_a = matmul_node->get_offset_a();
+    load_offset_b = matmul_node->get_offset_b();
+    store_offset_c = matmul_node->get_offset_c();
 }
 
 void MatMulEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr<brgemm_kernel_t>& brgKernel, bool use_amx) const {
@@ -1070,7 +1074,8 @@ void MatMulEmitter::emit_impl(const std::vector<size_t>& in,
 template <dnnl::impl::cpu::x64::cpu_isa_t isa>
 void MatMulEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, int bs,
                                             Reg64 addr_A, Reg64 addr_B,
-                                            const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const {
+                                            const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch,
+                                            const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const {
     using Vmm = typename dnnl::impl::utils::conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
     size_t gpr_size = 8;
     Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
@@ -1120,8 +1125,15 @@ void MatMulEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in
     // todo: Windows ABI : requires different num of arguments passed in regs and on the stack. Need to align.
     h->mov(abi_param1, reinterpret_cast<uintptr_t>(brgKernel));
     h->mov(abi_param2, bs);
-    h->uni_vmovq(abi_param3, Xmm(0));
-    h->uni_vmovq(abi_param4, Xmm(1));
+
+    const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t memory_bytes_offset, size_t kernel_bytes_offset) {
+        h->uni_vmovq(reg, xmm);
+        if (memory_bytes_offset) h->add(reg, memory_bytes_offset);
+        if (kernel_bytes_offset) h->add(reg, kernel_bytes_offset);
+    };
+    data_ptr(Xmm(0), abi_param3, load_offset_a, in0_kernel_offset);
+    data_ptr(Xmm(1), abi_param4, load_offset_b, in1_kernel_offset);
+
     size_t num_args_passed_on_stack = 1;
 #ifdef _WIN32
         num_args_passed_on_stack = 3;
@@ -1130,9 +1142,11 @@ void MatMulEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in
     h->mov(h->qword[h->rsp], reinterpret_cast<uint64_t>(scratch));
     h->mov(h->qword[h->rsp + gpr_size], reinterpret_cast<uintptr_t>(batch));
     h->mov(h->qword[h->rsp + 2 * gpr_size], Xmm(2));
+    if (store_offset_c) h->add(h->qword[h->rsp + 2 * gpr_size], store_offset_c);
+    if (out0_kernel_offset) h->add(h->qword[h->rsp + 2 * gpr_size], out0_kernel_offset);
 #else
     h->mov(abi_param5, reinterpret_cast<uintptr_t>(batch));
-    h->uni_vmovq(abi_param6, Xmm(2));
+    data_ptr(Xmm(2), abi_param6, store_offset_c, out0_kernel_offset);
     h->sub(h->rsp, gpr_size);
     h->mov(h->qword[h->rsp], reinterpret_cast<uint64_t>(scratch));
 #endif
@@ -1194,25 +1208,17 @@ void MatMulEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<si
                     const size_t in0_offset = (k * K0_step0 + mb * M_blk * brgemmCtx.LDA) * io_data_size[0];
                     const size_t in1_offset = (k * K0_step1 + n * N0_step0) * io_data_size[1];
                     const size_t out0_offset = (n * N0_step1 + mb * M_blk * brgemmCtx.LDC) * io_data_size[2];
-                    if (in0_offset != 0)
-                        h->add(input_0, in0_offset);
-                    if (in1_offset != 0)
-                        h->add(input_1, in1_offset);
-                    if (out0_offset != 0)
-                        h->add(output_0, out0_offset);
+
                     emit_brgemm_kernel_call<isa>(brgKernels0[getBrgIdx(mIdx, k, n)].get(),
                                                  1,
                                                  input_0,
                                                  input_1,
                                                  nullptr,
                                                  output_0,
-                                                 nullptr);
-                    if (in0_offset != 0)
-                        h->sub(input_0, in0_offset);
-                    if (in1_offset != 0)
-                        h->sub(input_1, in1_offset);
-                    if (out0_offset != 0)
-                        h->sub(output_0, out0_offset);
+                                                 nullptr,
+                                                 in0_offset,
+                                                 in1_offset,
+                                                 out0_offset);
                 }
             }
         }

diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
@@ -479,7 +479,8 @@ class MatMulEmitter : public jit_emitter {
     template <dnnl::impl::cpu::x64::cpu_isa_t isa>
     void emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, int bs,
                                  Reg64 addr_A, Reg64 addr_B,
-                                 const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const;
+                                 const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch,
+                                 const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const;
 
     static constexpr size_t MHA_BRGEMM_KERNELS_NUM = 8;
     static constexpr size_t matmulOptimalM = 32;
@@ -490,6 +491,10 @@ class MatMulEmitter : public jit_emitter {
     size_t M, M_blk, M_tail;
     size_t K0, K0_blk, K0_tail, N0, N0_blk, N0_tail;
     size_t brg0VnniFactor;
+
+    size_t load_offset_a = 0lu;
+    size_t load_offset_b = 0lu;
+    size_t store_offset_c = 0lu;
 };
 
 }   // namespace intel_cpu

diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -23,7 +23,7 @@ std::vector<std::vector<ov::PartialShape>> input_shapes{
         {{1, 2, 69, 43}, {2, 1, 43, 49}}
 };
 std::vector<element::Type> precisions{element::f32};
-INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMul, MatMul,
                      ::testing::Combine(
                              ::testing::ValuesIn(input_shapes),
                              ::testing::ValuesIn(precisions),
@@ -32,6 +32,42 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul,
                              ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                          MatMul::getTestCaseName);
 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 1, 43, 49}, {1, 1, 69, 49}}),
+                                 ::testing::ValuesIn(precisions),
+                                 ::testing::Values(4), // Sinh * 3 + Subgraph;
+                                 ::testing::Values(1), // Tokenized MatMul+Bias
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         MatMul::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}}),
+                                 ::testing::ValuesIn(precisions),
+                                 ::testing::Values(3), // Sinh * 2 + Subgraph;
+                                 ::testing::Values(1), // Tokenized Transpose+MatMul (no Bias in this case)
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         ExplicitTransposeMatMul::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}}),
+                                 ::testing::ValuesIn(precisions),
+                                 ::testing::Values(4), // Sinh * 3 + Subgraph;
+                                 ::testing::Values(1), // Tokenized MatMul+Bias
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         MatMul::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMulMatMulBias, ExplicitTransposeMulMatMulBias,
+                         ::testing::Combine(
+                                 ::testing::Values(std::vector<ov::PartialShape>{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 2, 1, 1}, {1, 1, 69, 49}}),
+                                 ::testing::ValuesIn(precisions),
+                                 ::testing::Values(5), // Sinh * 4 + Subgraph;
+                                 ::testing::Values(1), // Tokenized MatMul+Bias
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                         MatMul::getTestCaseName);
+
 namespace transpose_zero_input {
 std::vector<std::vector<ov::PartialShape>> transpose_input_shapes{
         {{2, 69, 3, 43}, {2, 3, 43, 49}}

diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp
@@ -12,29 +12,17 @@ namespace snippets {
 
 namespace {
 
-const std::vector<ov::Shape> inputShape = {
-    ov::Shape{1, 128, 3, 16},
+const std::vector<ov::Shape> inputShapes = {
+        {1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64},
 };
 
-INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmax, TransposeSoftmax,
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHA,
                      ::testing::Combine(
-                             ::testing::Values(inputShape),
-                             ::testing::Values(std::vector<int64_t>{0, 2, 3, 1}),
-                             ::testing::Values(-1),
-                             ::testing::Values(2),  // Subgraph + Sin
+                             ::testing::Values(inputShapes),
+                             ::testing::Values(5),  // Subgraph + 4xSin
                              ::testing::Values(1),
                              ::testing::Values(CommonTestUtils::DEVICE_CPU)),
-                     TransposeSoftmax::getTestCaseName);
-
-INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmaxEltwise, TransposeSoftmaxEltwise,
-                         ::testing::Combine(
-                                 ::testing::Values(inputShape),
-                                 ::testing::Values(std::vector<int64_t>{0, 2, 3, 1}),
-                                 ::testing::Values(-1),
-                                 ::testing::Values(2),  // Subgraph + Sin
-                                 ::testing::Values(1),
-                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
-                         TransposeSoftmax::getTestCaseName);
+                     MHA::getTestCaseName);
 
 } // namespace
 } // namespace snippets