Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Snippets][CPU] Added FP32 MHA tokenization support #14327

Merged
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0075a5f
[Snippets][CPU] Explicit loop (#55)
IvanNovoselov Nov 29, 2022
a748dac
Sns explicit tiles leftovers (#60)
IvanNovoselov Nov 30, 2022
5d3e6f4
Sns transpose support (#56)
IvanNovoselov Dec 1, 2022
084a844
Sns transpose leftovers (#62)
IvanNovoselov Dec 2, 2022
6cb1bae
Sns matmul support (#61)
IvanNovoselov Dec 5, 2022
f86fd91
[Snippets] Added Softmax support (#57)
a-sidorova Dec 21, 2022
729e216
[Snippets] Added Select, Broadcast support (#63)
a-sidorova Dec 28, 2022
2bb6f4a
[Snippets] MHA Tokenization (#59)
a-sidorova Jan 9, 2023
cba38ae
Merge remote-tracking branch 'upstream/master' into feature/snippets/mha
a-sidorova Jan 9, 2023
2a4ab82
Updated MHA Custom tests
a-sidorova Jan 9, 2023
457738b
Some fixes for transformations
a-sidorova Jan 9, 2023
ebc1eac
Merge remote-tracking branch 'upstream/master' into feature/snippets/mha
a-sidorova Jan 10, 2023
85e3a20
Added skipping tests for non-AVX512, fixed Select
a-sidorova Jan 10, 2023
1722e43
Fixed MatMul on Win
a-sidorova Jan 11, 2023
7bdf90b
Reverted shape for ConvEltwise test
a-sidorova Jan 11, 2023
7cd17ff
Increased threshold for attention onnx model
a-sidorova Jan 11, 2023
bc47131
Merge remote-tracking branch 'upstream/master' into feature/snippets/mha
a-sidorova Jan 11, 2023
a89b262
Increased onnx test tol one more time
a-sidorova Jan 11, 2023
c33b941
Merge remote-tracking branch 'upstream/master' into feature/snippets/mha
a-sidorova Jan 11, 2023
6cce86e
Merge branch 'master' into feature/snippets/mha
IvanNovoselov Jan 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/common/snippets/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ ie_faster_build(${TARGET_NAME}
)

target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime
PRIVATE ngraph_reference openvino::runtime::dev)
PRIVATE ngraph_reference ov_shape_inference openvino::runtime::dev)

target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${PUBLIC_HEADERS_DIR}>)
target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${PUBLIC_HEADERS_DIR}>
PRIVATE $<BUILD_INTERFACE:${SHAPE_INFER_INCLUDE_DIR}>)

add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})

Expand Down
30 changes: 26 additions & 4 deletions src/common/snippets/include/snippets/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,15 @@ class Schedule {
* @param f can this kernel be linearized to 1D range
* @param p pointer to generated code
*/
Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {}
Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {}
/**
* @brief Returns callable instance of code pointer
*/
template<typename K> K get_callable() const {
return reinterpret_cast<K>(const_cast<unsigned char*>(ptr));
}

Shape work_size {};
ov::PartialShape work_size {};
bool is_flat {false};
code ptr {nullptr};
};
Expand All @@ -112,21 +112,43 @@ class Generator {
* @brief Default destructor
*/
virtual ~Generator() = default;
/**
* @interface GeneratorConfig
* @brief Allows tweaking the lowering process.
*        All flags are plain public members with in-class defaults.
*/
class GeneratorConfig {
public:
// True if the lowered Emitters need to be accessed at runtime. Normally they are destroyed after code emission.
bool m_save_lowered_code = false;
// True if tails can be optimized for single evaluation during code generation.
// See the generate() method for more details and optimization examples.
// For example, tails with Buffer ops don't support single-evaluation optimizations:
// the memory pointer must always be reset using finalization offsets
// after data is stored to the Buffer.
bool m_optimize_single_evaluation = true;
// True if the runtime info of nodes should be checked in order to call the specific transformations that are needed.
// NOTE(review): judging by the name, this concerns filling tail registers - confirm against generate().
bool m_need_fill_tail_register = false;
};
/**
* @brief virtual method any specific implementation should implement
* @param m model in canonical form for table-based code generation
* @param config config with transformation and optimization parameters
* @param compile_params parameters for generated code
* @return pointer to generated code
*/
code generate(std::shared_ptr<ov::Model>& m, const void* compile_params = nullptr) const;
code generate(std::shared_ptr<ov::Model>& m, const GeneratorConfig& config, const void* compile_params = nullptr);

/**
* @brief gets target machine
* @return pointer to constant target machine
*/
std::shared_ptr<const TargetMachine> get_target_machine() const { return target; }
std::shared_ptr<const TargetMachine> get_target_machine() const;

protected:
std::shared_ptr<TargetMachine> target;
// todo: we need to save lowered code to access compiled brgemm kernels on execution time (normally lowered is destructed by then).
// This is temporary solution, remove this when kernel caching is implemented. Don't forget to make generate const method.
std::vector<AllocatedEmitter> lowered_saved;
};

} // namespace snippets
Expand Down
47 changes: 47 additions & 0 deletions src/common/snippets/include/snippets/op/brgemm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ngraph/op/op.hpp"
#include "ngraph/op/matmul.hpp"

namespace ngraph {
namespace snippets {
namespace op {

/**
 * @interface Brgemm
 * @brief Batch-reduced matrix multiplication that supports arbitrary strides between matrix rows.
 *        On top of the MatMul interface the operation keeps three offsets:
 *        one for the first input, one for the second input and one for the output.
 * @ingroup snippets
 */
class Brgemm : public ngraph::op::v0::MatMul {
public:
    OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul);

    Brgemm() = default;
    /**
     * @param A first input
     * @param B second input
     * @param offset_a offset for the first input (zero by default)
     * @param offset_b offset for the second input (zero by default)
     * @param offset_c offset for the output (zero by default)
     */
    Brgemm(const Output<Node>& A, const Output<Node>& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu);

    // Node interface; the definitions live out of line.
    bool visit_attributes(AttributeVisitor& visitor) override;
    void validate_and_infer_types() override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

    // The operation does not provide an evaluate() implementation.
    bool has_evaluate() const override {
        return false;
    }

    // Offset accessors: "a" is the first input, "b" the second input, "c" the output.
    size_t get_offset_a() const {
        return m_offset_a;
    }
    size_t get_offset_b() const {
        return m_offset_b;
    }
    size_t get_offset_c() const {
        return m_offset_c;
    }

    // Offset mutators, one per accessor above.
    void set_offset_a(const size_t new_offset) {
        m_offset_a = new_offset;
    }
    void set_offset_b(const size_t new_offset) {
        m_offset_b = new_offset;
    }
    void set_offset_c(const size_t new_offset) {
        m_offset_c = new_offset;
    }

private:
    size_t m_offset_a{0lu};  // offset for the first input
    size_t m_offset_b{0lu};  // offset for the second input
    size_t m_offset_c{0lu};  // offset for the output
};

} // namespace op
} // namespace snippets
} // namespace ngraph
12 changes: 9 additions & 3 deletions src/common/snippets/include/snippets/op/broadcastload.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,20 @@ class BroadcastLoad : public BroadcastMove {
public:
OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove);

BroadcastLoad(const Output<Node>& x, Shape output_shape);
BroadcastLoad(const Output<Node>& x, ov::PartialShape output_shape, size_t offset = 0lu);
BroadcastLoad() = default;

std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
size_t get_offset() const { return m_offset; }
void set_offset(const size_t offset) { m_offset = offset; }

bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;

private:
size_t m_offset = 0lu;
};

} // namespace op
} // namespace snippets
} // namespace ngraph
} // namespace ngraph
7 changes: 2 additions & 5 deletions src/common/snippets/include/snippets/op/broadcastmove.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op {
public:
OPENVINO_OP("BroadcastMove", "SnippetsOpset");

BroadcastMove(const Output<Node>& x, Shape output_shape);
BroadcastMove(const Output<Node>& x, ov::PartialShape output_shape);
BroadcastMove() = default;

bool visit_attributes(AttributeVisitor& visitor) override;
Expand All @@ -28,12 +28,9 @@ class BroadcastMove : public ngraph::op::Op {

void validate_and_infer_types() override;

OPENVINO_SUPPRESS_DEPRECATED_START
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
OPENVINO_SUPPRESS_DEPRECATED_END

protected:
Shape output_shape;
ov::PartialShape output_shape;
};

} // namespace op
Expand Down
47 changes: 47 additions & 0 deletions src/common/snippets/include/snippets/op/buffer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/op/op.hpp>

namespace ngraph {
namespace snippets {
namespace op {

/**
 * @interface Buffer
 * @brief Operation for intermediate data storage.
 *        - m_allocation_rank - rank of the shape used for memory allocation:
 *          shape[shape_rank - normalize(m_allocation_rank) : shape_rank].
 *          It is needed to allocate the required memory size, which depends on the Tile rank, for example.
 *          The default value is -1 (full shape).
 *        Notes:
 *        - All buffers in a graph share the same memory pointer. So if there are several buffers,
 *          each MemoryAccess op that corresponds to a Buffer should have an offset relative to that common pointer.
 *        - Buffer should be the single consumer of an operation output port.
 * @ingroup snippets
 */
class Buffer : public ngraph::op::Op {
public:
    OPENVINO_OP("Buffer", "SnippetsOpset");

    Buffer() = default;
    /**
     * @param x input of the Buffer
     * @param allocation_rank rank of the shape used for memory allocation, -1 (default) means the full shape
     */
    Buffer(const Output<Node>& x, const int32_t allocation_rank = -1);

    // Allocation rank accessor and mutator.
    int32_t get_allocation_rank() const {
        return m_allocation_rank;
    }
    void set_allocation_rank(int32_t new_rank) {
        m_allocation_rank = new_rank;
    }

    // Declared here, defined out of line.
    size_t get_byte_size() const;

    // Node interface; the definitions live out of line.
    bool visit_attributes(AttributeVisitor& visitor) override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
    void validate_and_infer_types() override;

private:
    int32_t m_allocation_rank{-1};  // -1 stands for the full shape
};

} // namespace op
} // namespace snippets
} // namespace ngraph
47 changes: 47 additions & 0 deletions src/common/snippets/include/snippets/op/fill.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/op/op.hpp>

namespace ngraph {
namespace snippets {
namespace op {

/**
* @interface Fill
* @brief Generated in Tail Loop vector representation in code generation step for cases when we should
* refill registers by special values.
* For example, for cases with ReduceMax or ReduceSum in Softmax
* Where:
* - offset - starting element index where filling is performed while beginning of input data is untouched
* - fill_value - hexadecimal filling value
* @ingroup snippets
*/
class Fill : public ngraph::op::Op {
public:
OPENVINO_OP("Fill", "SnippetsOpset");

Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value = 0x0);
Fill() = default;

size_t get_offset() const { return m_offset; }
uint32_t get_fill_value() const { return m_fill_value; }

void set_offset(const size_t offset) { m_offset = offset; }
void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; }

bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;

protected:
size_t m_offset = 0lu;
uint32_t m_fill_value = 0x0;
};

} // namespace op
} // namespace snippets
} // namespace ngraph
32 changes: 32 additions & 0 deletions src/common/snippets/include/snippets/op/horizon_max.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ngraph/op/op.hpp"

namespace ngraph {
namespace snippets {
namespace op {

/**
* @interface HorizonMax
* @brief The operation calculates a horizon maximum of a vector register
* @ingroup snippets
*/
class HorizonMax : public ngraph::op::Op {
public:
OPENVINO_OP("HorizonMax", "SnippetsOpset");

HorizonMax(const Output<Node>& x);
HorizonMax() = default;

bool visit_attributes(AttributeVisitor& visitor) override { return true;}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
};

} // namespace op
} // namespace snippets
} // namespace ngraph
32 changes: 32 additions & 0 deletions src/common/snippets/include/snippets/op/horizon_sum.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ngraph/op/op.hpp"

namespace ngraph {
namespace snippets {
namespace op {

/**
* @interface HorizonSum
* @brief The operation calculates a horizon sum of a vector register
* @ingroup snippets
*/
class HorizonSum : public ngraph::op::Op {
public:
OPENVINO_OP("HorizonSum", "SnippetsOpset");

HorizonSum(const Output<Node>& x);
HorizonSum() = default;

bool visit_attributes(AttributeVisitor& visitor) override { return true;}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
};

} // namespace op
} // namespace snippets
} // namespace ngraph
7 changes: 4 additions & 3 deletions src/common/snippets/include/snippets/op/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,14 @@ class Kernel : public ngraph::op::Op {
public:
OPENVINO_OP("Kernel", "SnippetsOpset");

Kernel(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
Kernel(std::vector<AllocatedEmitter> region, std::shared_ptr<const ov::Model> m);
Kernel() = default;

std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
std::vector<AllocatedEmitter> region;
const std::shared_ptr<const ov::Model> model;

std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
return std::make_shared<Kernel>(region);
return std::make_shared<Kernel>(region, model);
}
const void *compile_params = nullptr;
};
Expand Down
Loading