Commit: Fixes

a-sidorova committed Jan 12, 2023
1 parent 42e7c79 commit 61c6b42
Showing 20 changed files with 149 additions and 95 deletions.
4 changes: 4 additions & 0 deletions src/common/snippets/include/snippets/op/brgemm.hpp
@@ -26,6 +26,10 @@ class Brgemm : public MemoryAccess {
bool transposed_a() const { return m_transposed_a; }
bool transposed_b() const { return m_transposed_b; }

size_t get_offset_a() const { return get_input_port_descriptor(0).m_offset; }
size_t get_offset_b() const { return get_input_port_descriptor(1).m_offset; }
size_t get_offset_c() const { return get_output_port_descriptor(0).m_offset; }

bool visit_attributes(AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
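For orientation, the emitter hunks later in this commit switch from reading port descriptors directly to these new getters; a minimal before/after sketch (illustrative only, mirroring the BrgemmEmitter change below):

// Before: the emitter reached into the port descriptor itself.
load_offset_a = brgemm_node->get_input_port_descriptor(0).m_offset;
// After: the getter hides the descriptor layout behind the op's interface.
load_offset_a = brgemm_node->get_offset_a();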
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/op/broadcastload.hpp
@@ -24,6 +24,8 @@ class BroadcastLoad : public MemoryAccess {
BroadcastLoad(const Output<Node>& x, ov::PartialShape output_shape, size_t offset = 0lu);
BroadcastLoad() = default;

size_t get_offset() const { return get_input_port_descriptor(0).m_offset; }

bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
25 changes: 13 additions & 12 deletions src/common/snippets/include/snippets/op/buffer.hpp
@@ -12,14 +12,7 @@ namespace op {

/**
* @interface Buffer
* @brief The operation is for intermediate data storage
* TODO
* - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank].
* It's needed to allocate needed memory size that depends on Tile rank, for example.
* Default value is -1 (full shape)
* - m_static_shape - static shape that describes Buffer size in cases when Buffer doesn't have parent node.
* - m_element_type - element type in cases when Buffer doesn't have parent node.
* - m_single - True if Buffer doesn't have parent node else False
* @brief This is a base class for memory storage.
* Notes:
 * - All Buffers in a graph share the same memory pointer. So if there are several Buffers,
 *   each MemoryAccess op that works with a Buffer should store its own offset into this common memory pointer
@@ -39,8 +32,8 @@ class Buffer : public ngraph::op::Op {

/**
* @interface AllocationBuffer
* @brief The operation is for allocation new empty memory
* TODO
 * @brief The operation allocates new empty memory. Its single parent (input) defines the allocation shape
 * - m_element_type - element type of the allocated memory
* @ingroup snippets
*/
class AllocationBuffer : public Buffer {
@@ -62,8 +55,16 @@ class AllocationBuffer : public Buffer {

/**
* @interface IntermediateBuffer
* @brief The operation is for intermediate data storage
* TODO
* @brief The operation is for intermediate data storage.
 * If the Buffer has only one parent, it allocates memory for the full input shape.
 * If the Buffer has a second parent as well, it allocates memory with the shape given by the values of the second input,
 * while keeping the first input's shape and element type for shape inference.
* For example,
 *      Parameter [5, 3, 128]      Constant [2] (with values {3, 128})
 *                    \               /
 *               Buffer with allocated memory 3x128 size
 *                             |
 *                    Result [5, 3, 128]
* @ingroup snippets
*/
class IntermediateBuffer : public Buffer {
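A hedged sketch of how the Buffer flavours described above are created: the single-parent IntermediateBuffer call mirrors the BrgemmToBrgemmCPU hunk later in this commit, while "repack" is a placeholder name and the two-input form (data plus allocation-shape Constant) is only assumed from the class comment, not shown in this diff:

// IntermediateBuffer with a single parent: memory is allocated for the full shape
// of that parent's output, and the parent's element type is reused for shape inference.
const auto buffer = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(repack->output(1));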
3 changes: 3 additions & 0 deletions src/common/snippets/include/snippets/op/load.hpp
@@ -25,6 +25,9 @@ class Load : public MemoryAccess {
Load(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
Load() = default;

size_t get_offset() const { return get_input_port_descriptor(0).m_offset; }
size_t get_count() const { return get_input_port_descriptor(0).m_count; }

void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
};
3 changes: 3 additions & 0 deletions src/common/snippets/include/snippets/op/store.hpp
@@ -25,6 +25,9 @@ class Store : public MemoryAccess {
Store(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
Store() = default;

size_t get_offset() const { return get_output_port_descriptor(0).m_offset; }
size_t get_count() const { return get_output_port_descriptor(0).m_count; }

void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
};
4 changes: 2 additions & 2 deletions src/common/snippets/src/op/memory_access.cpp
@@ -37,7 +37,7 @@ void MemoryAccess::set_input_port_descriptor(const PortDescriptor& desc, const s
}

PortDescriptor MemoryAccess::get_input_port_descriptor(const size_t i) const {
// We cannot use the same way as in ov::Node::get_input_descriptor because this method must be static
// We cannot use the same way as in ov::Node::get_input_descriptor because this method must be const
// to allow call const Derived::clone_with_new_inputs() method
NGRAPH_CHECK(i < m_input_ports.size(), "Index of input port descriptor should be less than count of input ports");
return m_input_ports[i];
@@ -60,7 +60,7 @@ void MemoryAccess::set_output_port_descriptor(const PortDescriptor& desc, const s
}

PortDescriptor MemoryAccess::get_output_port_descriptor(const size_t i) const {
// We cannot use the same way as in ov::Node::get_input_descriptor because this method must be static
// We cannot use the same way as in ov::Node::get_input_descriptor because this method must be const
// to allow call const Derived::clone_with_new_inputs() method
NGRAPH_CHECK(i < m_output_ports.size(), "Index of output port descriptor should be less than count of output ports");
return m_output_ports[i];
11 changes: 2 additions & 9 deletions src/common/snippets/src/op/subgraph.cpp
@@ -448,8 +448,8 @@ void snippets::op::Subgraph::initialize_buffer_scratchpad_size() {

// Propagate to up: in Store. Buffer can have only one Store
{
if (buffer->get_input_size() > 0) {
auto parent = buffer->get_input_node_shared_ptr(0);
auto parent = buffer->get_input_node_shared_ptr(0);
if (!ov::is_type<ngraph::op::v0::Constant>(parent)) {
auto idx = buffer->input(0).get_source_output().get_index();
while (ov::is_type<snippets::op::LoopBase>(parent)) {
const auto source_output = parent->input_value(idx);
@@ -498,13 +498,6 @@ void snippets::op::Subgraph::initialize_buffer_scratchpad_size() {
for (const auto& op : ops) {
if (const auto buffer = ov::as_type_ptr<ngraph::snippets::op::Buffer>(op)) {
const auto buffer_size = buffer->get_byte_size();
if (ov::is_type<op::IntermediateBuffer>(op)) {
if (op->get_input_size() == 2) {
op->set_arguments({op->get_input_source_output(0)});
}
} else if (ov::is_type<op::AllocationBuffer>(op)) {
op->set_arguments(ov::OutputVector{});
}
// We need to allocate memory for first buffer at least
if (m_buffer_scratchpad == 0) {
m_buffer_scratchpad += buffer_size;
22 changes: 19 additions & 3 deletions src/common/snippets/src/pass/assign_registers.cpp
@@ -9,6 +9,14 @@

namespace {
static constexpr size_t reg_count = 16lu;

auto filter_ops(const std::shared_ptr<ov::Node>& op) -> bool {
if (ov::is_type<ngraph::op::v0::Constant>(op) &&
ov::is_type<ngraph::snippets::op::Buffer>(op->get_output_target_inputs(0).begin()->get_node()))
return false;
return true;
}

} // namespace

bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
@@ -19,8 +27,12 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
auto ops = f->get_ordered_ops();

std::vector<std::pair<TargetMachine::opRegType, std::shared_ptr<Node>>> typed_ops;
for (const auto& op : ops)
typed_ops.emplace_back(std::make_pair(m_target_machine->get_op_reg_type(op), op));
for (const auto& op : ops) {
if (filter_ops(op)) {
typed_ops.emplace_back(std::make_pair(m_target_machine->get_op_reg_type(op), op));
}
}

size_t counter_vec = 0;
size_t counter_gpr = 0;
std::map<tensor, Reg> regs_vec, regs_gpr;
@@ -120,8 +132,12 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
for (size_t i = 0; i < typed_ops.size(); i++) {
const auto& t_op = typed_ops[i];
std::vector<tensor> used_tensors, defined_tensors;
for (const auto& in : t_op.second->inputs())
for (const auto& in : t_op.second->inputs()) {
if (ov::is_type<snippets::op::Buffer>(t_op.second) &&
ov::is_type<opset1::Constant>(t_op.second->get_input_node_shared_ptr(in.get_index())))
continue;
used_tensors.push_back(in.get_tensor_ptr());
}
for (const auto& out : t_op.second->outputs())
defined_tensors.push_back(out.get_tensor_ptr());
switch (t_op.first) {
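To illustrate what filter_ops excludes, here is the allocation-shape pattern as it is built in the BrgemmToBrgemmCPU hunk later in this commit (names as in that hunk; sketch only):

// The Constant's only consumer is a Buffer, so it carries an allocation shape rather than
// data to compute on; filter_ops(scratch_size) returns false and the Constant is skipped
// during register assignment, while filter_ops(scratch) returns true as usual.
const auto scratch_size = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int32_t>{32 * 1024});
const auto scratch = std::make_shared<ngraph::snippets::op::AllocationBuffer>(scratch_size, ov::element::u8);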
6 changes: 0 additions & 6 deletions src/common/snippets/src/pass/insert_loops.cpp
@@ -222,9 +222,6 @@ bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
if (m_master_shape.is_dynamic())
throw ngraph_error("InsertLoops doesn't support dynamic shapes yet");

ov::pass::Serialize("/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops.xml",
"/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops.bin").run_on_model(model);

const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length();
const auto outer_work_amount = m_loop_depth == 2 ? utils::get_outer_dim(m_master_shape).get_length() : 1;

@@ -285,9 +282,6 @@ bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
}
}

ov::pass::Serialize("/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops_after.xml",
"/home/a-sidorova/projects/mha_matmul/openvino/graphs/loops_after.bin").run_on_model(model);

return true;
}

2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/emitters/cpu_generator.cpp
@@ -50,7 +50,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
jitters[ngraph::snippets::op::AllocationBuffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
jitters[ngraph::snippets::op::IntermediateBuffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(VectorBufferEmitter);
// jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported
jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // Constants are emitted as no-ops

jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
jitters[ngraph::snippets::op::LoadReshape::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
71 changes: 47 additions & 24 deletions src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
@@ -548,9 +548,8 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c
IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();

const auto store = ov::as_type_ptr<ngraph::snippets::op::Store>(n);
const auto desc = store->get_output_port_descriptor(0);
count = desc.m_count;
byte_offset = desc.m_offset;
count = store->get_count();
byte_offset = store->get_offset();
in_out_type_ = emitter_in_out_map::vec_to_gpr;
store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count));
}
@@ -590,9 +589,8 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu
IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();

const auto load = std::dynamic_pointer_cast<ngraph::snippets::op::Load>(n);
const auto desc = load->get_input_port_descriptor(0);
count = desc.m_count;
byte_offset = desc.m_offset;
count = load->get_count();
byte_offset = load->get_offset();
in_out_type_ = emitter_in_out_map::gpr_to_vec;
load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
}
@@ -632,8 +630,7 @@ BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator*
IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();

const auto broadcast_load = std::dynamic_pointer_cast<ngraph::snippets::op::BroadcastLoad>(n);
const auto desc = broadcast_load->get_input_port_descriptor(0);
byte_offset = desc.m_offset;
byte_offset = broadcast_load->get_offset();
in_out_type_ = emitter_in_out_map::gpr_to_vec;
}

@@ -673,9 +670,8 @@ void BroadcastLoadEmitter::emit_isa(const std::vector<size_t> &in, const std::ve
LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n) {
const auto load = ov::as_type_ptr<ngraph::snippets::op::Load>(n);
const auto desc = load->get_input_port_descriptor(0);
count = desc.m_count;
byte_offset = desc.m_offset;
count = load->get_count();
byte_offset = load->get_offset();
in_out_type_ = emitter_in_out_map::gpr_to_vec;
load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
}
@@ -710,9 +706,8 @@ void LoadConvertEmitter::emit_data() const {
StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
const auto store = ov::as_type_ptr<ngraph::snippets::op::Store>(n);
const auto desc = store->get_output_port_descriptor(0);
count = desc.m_count;
byte_offset = desc.m_offset;
count = store->get_count();
byte_offset = store->get_offset();
in_out_type_ = emitter_in_out_map::vec_to_gpr;

if (ov::is_type<ov::intel_cpu::StoreConvertTruncation>(n)) {
@@ -848,10 +843,11 @@ BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl:
}
}

load_offset_a = brgemm_node->get_input_port_descriptor(0).m_offset;
load_offset_b = brgemm_node->get_input_port_descriptor(1).m_offset;
load_offset_scratch = brgemm_node->get_input_port_descriptor(2).m_offset;
store_offset_c = brgemm_node->get_output_port_descriptor(0).m_offset;
load_offset_a = brgemm_node->get_offset_a();
load_offset_b = brgemm_node->get_offset_b();
store_offset_c = brgemm_node->get_offset_c();
if (with_scratch)
load_offset_scratch = brgemm_node->get_offset_scratch();
}

void BrgemmEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr<brgemm_kernel_t>& brgKernel, bool use_amx) const {
@@ -938,8 +934,35 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
Reg64 addr_A, Reg64 addr_B, Reg64 scratch, Reg64 addr_C,
const size_t in0_kernel_offset, const size_t in1_kernel_offset,
const size_t in2_kernel_offset, const size_t out0_kernel_offset) const {
if (ctx.is_with_amx)
amx_tile_configure(ctx.palette);
if (ctx.is_with_amx) {
size_t gpr_size = 8;
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

h->sub(h->rsp, n_gprs_to_save * gpr_size);
for (size_t i = 0; i < n_gprs_to_save; ++i)
h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

// save function address in gpr to pass in call instruction
const auto& overload = static_cast<status_t(*)(const char*)>(amx_tile_configure);
h->mov(h->rbp, reinterpret_cast<uintptr_t>(overload));
h->mov(abi_param1, reinterpret_cast<uintptr_t>(ctx.palette));

// align stack on 16-byte as ABI requires
// note that RBX must not be changed by the callee
h->mov(h->rbx, h->rsp);
h->and_(h->rbx, 0xf);
h->sub(h->rsp, h->rbx);

h->call(h->rbp);

h->add(h->rsp, h->rbx);
// restore gpr registers
for (int i = n_gprs_to_save - 1; i >= 0; --i)
h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
h->add(h->rsp, n_gprs_to_save * gpr_size);
}

size_t gpr_size = 8;
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
@@ -1073,10 +1096,10 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(dnnl::impl::cpu::x64::jit_generator* h, d
brgemm_prc_in1 = brgemm_repack->get_input_element_type(0);
brgemmVNNIFactor = 4 / brgemm_prc_in0.size();
with_comp = brgemm_repack->is_with_comp();
in_offset = brgemm_repack->get_input_port_descriptor(0).m_offset;
out_offset = brgemm_repack->get_output_port_descriptor(0).m_offset;
in_offset = brgemm_repack->get_offset_in();
out_offset = brgemm_repack->get_offset_out();
if (with_comp)
comp_offset = brgemm_repack->get_output_port_descriptor(1).m_offset;
comp_offset = brgemm_repack->get_offset_comp();

auto layout = ngraph::snippets::utils::get_node_output_layout(brgemm_repack->get_input_node_shared_ptr(0));
const auto& original_shape = brgemm_repack->get_input_shape(0);
@@ -1113,7 +1136,7 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(dnnl::impl::cpu::x64::jit_generator* h, d

const auto dt_in0 = static_cast<dnnl_data_type_t>(DnnlExtensionUtils::IEPrecisionToDataType(InferenceEngine::details::convertPrecision(brgemm_prc_in0)));
const auto dt_in1 = static_cast<dnnl_data_type_t>(DnnlExtensionUtils::IEPrecisionToDataType(InferenceEngine::details::convertPrecision(brgemm_prc_in1)));
init_brgemm_copy(kernel, leading_dimension, N_blk, N_tail, LDB, (K_tail == 0 ? K : K_tail), use_amx, dt_in0, dt_in1);
init_brgemm_copy(kernel, leading_dimension, N_blk, N_tail, LDB, K - K_tail, use_amx, dt_in0, dt_in1);
}

void BrgemmCopyBEmitter::init_brgemm_copy(std::unique_ptr<matmul::jit_brgemm_matmul_copy_b_t>& kernel,
@@ -31,7 +31,8 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::BrgemmToBrgemmCPU")
auto& pm = m.get_pattern_value_map();
const auto brgemm = ov::as_type_ptr<ngraph::snippets::op::Brgemm>(pm.at(m_brgemm).get_node_shared_ptr());
if (!brgemm)
const auto brgemm_plugin = ov::as_type_ptr<BrgemmCPU>(pm.at(m_brgemm).get_node_shared_ptr());
if (!brgemm || brgemm_plugin)
return false;

if (brgemm->get_input_partial_shape(0).is_dynamic() || brgemm->get_input_partial_shape(1).is_dynamic()) {
@@ -70,8 +71,8 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() {
if (with_amx || with_comp) {
std::shared_ptr<ngraph::snippets::op::Buffer> scratch = nullptr;
if (with_amx) {
const auto scratch_size = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int32_t>{4 * 1024});
scratch = std::make_shared<ngraph::snippets::op::AllocationBuffer>(scratch_size, ov::element::i32);
const auto scratch_size = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int32_t>{32 * 1024});
scratch = std::make_shared<ngraph::snippets::op::AllocationBuffer>(scratch_size, ov::element::u8);
} else if (with_comp) {
scratch = std::make_shared<ngraph::snippets::op::IntermediateBuffer>(brgemmRepackIn1->output(1));
}
@@ -47,7 +47,6 @@ void intel_cpu::BrgemmCopyB::validate_and_infer_types() {
if (m_with_comp) {
set_output_type(1, ov::element::f32, ov::PartialShape{ov::Dimension::dynamic()});
}

return;
}
