diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index bfc7f3e1c8a847..2edbe58a272282 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -123,10 +123,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); jitters[ngraph::snippets::op::TileBegin::get_type_info_static()] = CREATE_EMITTER(TileBeginEmitter); jitters[ngraph::snippets::op::TileEnd::get_type_info_static()] = CREATE_EMITTER(TileEndEmitter); - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index ceaaba26b64f20..88c8d367ca7a0e 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -146,33 +146,16 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, // master_shape size must be valid in both static and dynamic cases const int64_t offsetRank = jcp.master_shape.size() - 1; std::function init_ptr_with_offset; - if (jcp.is_static) { - init_ptr_with_offset = [&](Reg64 pointer, size_t offset_start_index, Reg64 reg_tmp) { - const int64_t *offsets = jcp.data_offsets + offset_start_index; - for (int j = 0; j < offsetRank; j++) { - if (jcp.master_shape[j] != 1 && offsets[j] != 0) { - h->mov(reg_tmp, offsets[j]); - h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); - h->add(pointer, reg_tmp); - } - } - }; - } else { - init_ptr_with_offset = [&](Reg64 pointer, size_t offset_displ, Reg64 reg_tmp) { - const size_t data_offests_displ = GET_OFF(data_offsets) + offset_displ * sizeof(int64_t); - // todo: we can pre-filter data_offsets so that only (master_shape[k] != 1 && offsets[k] != 0) are stored there - // but we'll need an additional index array to specify appropriate "k" values for every input - // * size_t num_non_zero_offsets[num_params] - specifies number of non-zero offsets for every input - // * size_t offsetted_indexes* - points to memory chunk sizeof(sum(num_non_zero_offsets) * sizeof(size_t)) - - // specifies indexes of input indexes (reg_index) that need an offset - // * size_t data_offsets* - the same size as offsetted_indexes - offset values for input indexes - for (int j = 0; j < offsetRank; j++) { - h->mov(reg_tmp, h->ptr[reg_const_params + data_offests_displ + j * sizeof(int64_t)]); + init_ptr_with_offset = [&](Reg64 pointer, size_t offset_start_index, Reg64 reg_tmp) { + const int64_t *offsets = jcp.data_offsets + offset_start_index; + for (int j = 0; j < offsetRank; j++) { + if (jcp.master_shape[j] != 1 && offsets[j] != 0) { + h->mov(reg_tmp, offsets[j]); h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); h->add(pointer, reg_tmp); } - }; - } + } + }; const bool last_iter_explicitly = gp_regs_pool.empty(); Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(gp_regs_pool.back()); size_t i = 0; @@ -191,15 +174,8 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, if (last_iter_explicitly) { h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); reg_tmp = reg_const_params; - if (jcp.is_static) { - // can corrupt reg_const_params, since we won't use it anymore - init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); - } else { - // have to restore reg_tmp explicitly in dynamic case, can use stack or vector reg - h->push(reg_tmp); - init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); - h->pop(reg_tmp); - } + // can corrupt reg_const_params, since we won't use it anymore + init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); } } void KernelEmitter::emit_impl(const std::vector& in, @@ -223,416 +199,15 @@ void KernelEmitter::emit_impl(const std::vector& in, auto local_gpr_pool = gp_regs_pool; // we won't need indexes in both static and dynamic cases, since offsets are already calculated local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); -// if (jcp.is_static) { -// local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); -// } for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; std::tie(in_regs, out_regs) = c.second; - if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) { - // dynamic TileScheduler needs const runtime params - if (!jcp.is_static) { - in_regs.push_back(static_cast(reg_const_params.getIdx())); - } - out_regs = gp_regs_used; - } emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); } h->postamble(); } -TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile_scheduler = ov::as_type_ptr(n); - if (!tile_scheduler) - IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; - if (tile_scheduler->compile_params == nullptr) - IE_THROW() << "TileSchedulerEmitter invoked with op::TileScheduler that contains no compile_params"; - body = {tile_scheduler->vector_region, tile_scheduler->scalar_region}; - jcp = *reinterpret_cast(tile_scheduler->compile_params); -} -void TileSchedulerEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); -} -void TileSchedulerEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (jcp.is_static && in.size() != 3) - IE_THROW() << "TileSchedulerEmitter (static) got invalid number of inputs. Expected 3, got " << in.size(); - if (!jcp.is_static && in.size() != 4) - IE_THROW() << "TileSchedulerEmitter (dynamic) got invalid number of inputs. Expected 4, got " << in.size(); - if (out.size() != in[0] + in[1]) - IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. Expected " << in[0] + in[1] << " , got " << out.size(); - if (body.size() != 2) - IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); - if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) - IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; -} - -void TileSchedulerEmitter::emit_static_tiles(const Reg64& reg_inner_amount, const std::vector& data_ptr_regs, size_t vector_size, - const std::vector& vec_pool, const std::vector& gpr_pool) const { - // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times - using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; - TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; - TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; - const size_t inner_work_amount = jcp.scheduler_work_amounts[1]; - const size_t outer_work_amount = jcp.scheduler_work_amounts[0]; - auto process_tile = - [&](const bool evaluate_once, const bool skip_increments, const TileAllocatedEmitter& tile) { - // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks - if (evaluate_once) { - tile.first->emit_body(vec_pool, gpr_pool); - if (!skip_increments) - tile.first->emit_ptr_increments_static(data_ptr_regs); - } else { - std::vector in_regs, out_regs; - std::tie(in_regs, out_regs) = tile.second; - // pass work_amount reg to Tile - in_regs.push_back(static_cast(reg_inner_amount.getIdx())); - for (const auto& reg : data_ptr_regs) - out_regs.emplace_back(reg.getIdx()); - tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool); - } - }; - // todo: these optimizations should be performed on using Tile graph representation in the future - bool vector_evaluate_once = false; - if (inner_work_amount >= vector_size) { - vector_evaluate_once = inner_work_amount < 2 * vector_size; - const bool skip_increments = outer_work_amount == 1 && inner_work_amount == vector_size; - // Need to set proper work amount for inner tiles if evaluated multiple times - if (!vector_evaluate_once) - h->mov(reg_inner_amount, inner_work_amount); - process_tile(vector_evaluate_once, skip_increments, vector_tile); - } - if (inner_work_amount % vector_size >= 1) { - bool scalar_evaluate_once = inner_work_amount % vector_size < 2; - if (!scalar_evaluate_once) { - // vector_tile is not executed, work_amount is not set - if (inner_work_amount < vector_size) { - h->mov(reg_inner_amount, inner_work_amount); - // vector_tile is executed, but work_amount is neither set nor decremented appropriately. - } else if (vector_evaluate_once) { - h->mov(reg_inner_amount, inner_work_amount - vector_size); - } - // else: vector_tile is executed multiple times, so work_amount is already set - } - const bool skip_increments = outer_work_amount == 1 && inner_work_amount % vector_size == 1; - process_tile(scalar_evaluate_once, skip_increments, scalar_tile); - } -} - -void TileSchedulerEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - if (jcp.is_static) - emit_static_impl(in, out, vec_pool, gpr_pool, emit_context); - else - emit_dynamic_impl(in, out, vec_pool, gpr_pool, emit_context); -} - -void TileSchedulerEmitter::emit_static_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t vector_size = in[2]; - const size_t num_params = num_inputs + num_outputs; - const auto& data_ptr_reg_idxs(out); - std::vector data_ptr_regs; - transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs); - // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. - // we need a more elegant approach to avoid a full copy here. Similar problem is demonstrated in KernelEmitter - auto local_gpr_pool = gpr_pool; - Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Label for_body; - const size_t outer_work_amount = jcp.scheduler_work_amounts[0]; - if (outer_work_amount == 1) { - // emit code directly without looping over external dim - emit_static_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - } else if (outer_work_amount > 1) { - // We need to create a Loop in this case - h->mov(reg_outer_amount, outer_work_amount); - h->L(for_body); - { - emit_static_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. - for (auto i = 0; i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); - } - } - // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) - h->sub(reg_outer_amount, 1); - h->cmp(reg_outer_amount, 1); - h->jge(for_body, CodeGenerator::T_NEAR); - } - } -} - -std::vector& TileEmitter::get_nested_code() { - return body; -} - -void TileSchedulerEmitter::emit_dynamic_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t vector_size = in[2]; - const size_t num_params = num_inputs + num_outputs; - - const auto& data_ptr_reg_idxs(out); - std::vector data_ptr_regs(data_ptr_reg_idxs.size()); - std::transform(data_ptr_reg_idxs.begin(), data_ptr_reg_idxs.end(), data_ptr_regs.begin(), [](size_t idx){return Reg64(static_cast(idx));}); - - Reg64 reg_const_params = Reg64(static_cast(in[3])); - // todo: this limitation could be removed if we use Reg32 to store work_amounts (which is more than enough), - // since at least one Reg64 (reg_indexes spared in the Kernel) is guaranteed to be in the pool - if (gpr_pool.size() < 2) - IE_THROW() << "Dynamic Tile Scheduler needs at least two spare gpr regs to operate."; - auto local_gpr_pool = gpr_pool; - Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; - TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; - TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; - auto emit_tiles = [&]() { - // the minimal requirement is that tile (vector or scalar) is emitted only if it has some work to do (>= 1 iterations) - auto process_tile = - [&](const size_t tile_increment, const TileAllocatedEmitter& tile) { - Label tile_end; - h->cmp(reg_inner_amount, tile_increment); - h->jl(tile_end, CodeGenerator::T_NEAR); - std::vector in_regs, out_regs; - std::tie(in_regs, out_regs) = tile.second; - // pass work_amount reg to Tile - in_regs.push_back(static_cast(reg_inner_amount.getIdx())); - in_regs.push_back(static_cast(reg_const_params.getIdx())); - for (const auto& reg : data_ptr_regs) - out_regs.emplace_back(reg.getIdx()); - tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool); - h->L(tile_end); - }; - h->mov(reg_inner_amount, h->ptr[reg_const_params + GET_OFF(scheduler_work_amounts) + sizeof(size_t)]); - process_tile(vector_size, vector_tile); - process_tile(1, scalar_tile); - }; - Label for_body, single_outer_tile, end; - { - h->mov(reg_outer_amount, h->ptr[reg_const_params + GET_OFF(scheduler_work_amounts)]); - // We don't need to apply scheduler offsets, or update reg_outer_amount in case of outer WA == 1 - h->cmp(reg_outer_amount, 1); - h->je(single_outer_tile, CodeGenerator::T_NEAR); - // - h->L(for_body); - { - emit_tiles(); - - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. - for (auto i = 0; i < num_params; i++) { - // NB! many scheduler offsets are zero - h->add(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(scheduler_offsets) + i * sizeof(int64_t)]); - } - // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) - h->sub(reg_outer_amount, 1); - h->cmp(reg_outer_amount, 1); - h->jge(for_body, CodeGenerator::T_NEAR); - h->jmp(end, CodeGenerator::T_NEAR); - } - h->L(single_outer_tile); - { - // emit code directly without looping over external dim and applying scheduler offsets - emit_tiles(); - } - h->L(end); - } -} - -TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - body = tile->region; - if (body.empty()) - IE_THROW() << "TileEmitter is invoked with empty body"; - num_inputs = tile->num_inputs; - num_outputs = tile->num_outputs; - io_dims = tile->io_dims; - io_data_size = tile->io_data_size; - size_t num_dynamic_inputs = 0; - const bool has_dynamic_dims = std::any_of(io_dims.begin(), io_dims.end(), - [](size_t x) {return x == Subgraph::DYNAMIC_DIMENSION;}); - for (size_t i = 0; i < io_dims.size(); i ++) { - // If a last dim is static, but == 1 and there are some dynamic inputs as well, - // then treat the dim as dynamic, since we'll now whether it's broadcasted only at runtime - if (io_dims[i] == Subgraph::DYNAMIC_DIMENSION || (io_dims[i] == 1 && has_dynamic_dims)) { - dynamic_dims_idx.push_back(i); - if (i < num_inputs) - num_dynamic_inputs++; - } else { - static_dims_idx.push_back(i); - } - } - dynamic_increments.resize(dynamic_dims_idx.size()); - dynamic_broadcasting.resize(num_dynamic_inputs); - // zero in io_dims indicates dynamic dimension - increment = tile->increment; - if (io_dims.size() != num_inputs + num_outputs) - IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()"; -} - -void TileEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); -} - -void TileEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - //todo: if one of the uppermost dimensions is dynamic (batch for example), node is still considered to be dynamic - // and evaluates dynamic pipeline. Hence dynamic_dims_idx may be empty, but in.size() still == 2. Fix this in future. - // if ((dynamic_dims_idx.empty() && in.size() != 1) || (!dynamic_dims_idx.empty() && in.size() !=2)) - if (in.size() != 1 && in.size() !=2) - IE_THROW() << "TileEmitter got invalid number of inputs."; - if (out.size() != io_dims.size()) - IE_THROW() << "TileEmitter got invalid number of outputs. Expected " << io_dims.size() << " , got " << out.size(); -} - -void TileEmitter::emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const { - for (auto& code : body) - code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); -} - -void TileEmitter::emit_ptr_increments_static(const std::vector& data_ptr_regs) const { - // note that master_shape_last_dim could be equal to Subgraph::DYNAMIC_DIMENSION for dynamic case - auto master_shape_last_dim = *std::max_element(io_dims.begin(), io_dims.end()); - for (const auto& idx : static_dims_idx) { - // increment only inputs that are not broadcasted - if (io_dims[idx] != 1 || master_shape_last_dim == 1) - h->add(data_ptr_regs[idx], increment * io_data_size[idx]); - } -} - -void TileEmitter::emit_ptr_increments_dynamic(const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { - emit_ptr_increments_static(data_ptr_regs); - const size_t tile_type_offset = increment > 1 ? GET_OFF(vector_tile_increments) : GET_OFF(scalar_tile_increments); - for (size_t i = 0; i < dynamic_dims_idx.size(); i++) { - auto idx = dynamic_dims_idx[i]; - h->add(data_ptr_regs[idx], h->ptr[reg_const_params + tile_type_offset + idx * sizeof(int64_t)]); - } -} - -template -void TileEmitter::set_increments_and_broadcast_inputs(const Reg64& reg_const_params, const std::vector &data_ptr_regs) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - auto Vmm_tmp = Vmm(0); - for (size_t i = 0; i < dynamic_dims_idx.size(); i++) { - auto idx = dynamic_dims_idx[i]; - const auto& data_ptr_reg = data_ptr_regs[idx]; - // todo: we can store dynamic broadcasting info only for dynamic inputs (not for all, like we do now) - h->cmp(h->byte[reg_const_params + GET_OFF(broadcasting_mask) + idx * sizeof(bool)], 0); - Label no_broadcasting; - h->je(no_broadcasting, CodeGenerator::T_SHORT); - // Both inputs and outputs can be dynamic, but only inputs could be physically broadcasted - // Physical broadcasting is only required for vector tiles - if (idx < num_inputs && increment != 1) { - h->push(data_ptr_reg); - h->uni_vbroadcastss(Vmm_tmp, h->ptr[data_ptr_reg]); - h->mov(data_ptr_reg, h->ptr[reg_const_params + GET_OFF(broadcasting_scratchpad)]); - h->add(data_ptr_reg, i * increment * io_data_size[idx]); - // note that we use data_ptr_reg directly without h->rip - h->uni_vmovups(h->ptr[data_ptr_reg], Vmm_tmp); - } - h->L(no_broadcasting); - } -} - -void TileEmitter::cleanup_broadcasting(const Reg64& reg_const_params, const std::vector &data_ptr_regs) const { - if (increment == 1) - return; - for (int i = static_cast(dynamic_dims_idx.size()) - 1; i >= 0; i--) { - const auto& idx = dynamic_dims_idx[i]; - if (idx >= num_inputs) - continue; - // todo: we can store dynamic broadcasting info only for dynamic inputs (not for all, like we do now) - Label no_broadcasting; - h->cmp(h->byte[reg_const_params + GET_OFF(broadcasting_mask) + idx * sizeof(bool)], 0); - h->je(no_broadcasting, CodeGenerator::T_SHORT); - h->pop(data_ptr_regs[idx]); - h->L(no_broadcasting); - } -} - -void TileEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - Reg64 work_amount = Reg64(static_cast(in[0])); - Reg64 reg_const_params; - // todo: unify interface for static & dynamic calls for TileEmitter? - // There is 1 arg for the static case, so we can assign any reg to reg_const_params, since it won't be really used. - // Anyway, try to assign a reg from the pool to prevent possible work_amount corruption - if (dynamic_dims_idx.empty()) { - reg_const_params = gpr_pool.empty() ? work_amount : Reg64(gpr_pool.back()); - } else { - reg_const_params = Reg64(static_cast(in[1])); - } - std::vector data_ptr_regs; - transform_idxs_to_regs(out, data_ptr_regs); - switch (host_isa_) { - case dnnl::impl::cpu::x64::sse41: - set_increments_and_broadcast_inputs(reg_const_params, data_ptr_regs); - break; - case dnnl::impl::cpu::x64::avx2: - set_increments_and_broadcast_inputs(reg_const_params, data_ptr_regs); - break; - case dnnl::impl::cpu::x64::avx512_core: - set_increments_and_broadcast_inputs(reg_const_params, data_ptr_regs); - break; - default: - IE_THROW() << "unsupported isa: " << host_isa_; - } - Label for_body; - // Note that: - // * Work amount must be set by TileScheduler that executes Tiles - // * TileScheduler executes Tile only if it has to perform >= 1 iterations - h->L(for_body); - emit_body(vec_pool, gpr_pool); - emit_ptr_increments_dynamic(reg_const_params, data_ptr_regs); - h->sub(work_amount, increment); - h->cmp(work_amount, increment); - h->jge(for_body, CodeGenerator::T_NEAR); - cleanup_broadcasting(reg_const_params, data_ptr_regs); -} TileBeginEmitter::TileBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : jit_emitter(h, isa, n) { @@ -710,37 +285,10 @@ TileEndEmitter::TileEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::imp finalization_offsets = tile_end->get_finalization_offsets(); evaluate_once = tile_end->get_evaluate_once(); reuse_work_amount_reg = tile_end->reuse_work_amount_reg; - // dim_idx must be equal for TileBegin and TileEnd - const auto dim_idx = tile_end->get_dimension(); - // todo: add checks on work_amount vs increment consistency + checks on work_amount vs max(last_dim) consistence - // probably better to implement them in Tile* constructors - for (int i = 0; i < num_inputs; i++) { - // todo: we can take the whole partial shape in future, since it should contain only one dim - const auto& relevant_dim = tile_begin->get_input_partial_shape(i)[dim_idx]; - io_dims.push_back(relevant_dim.is_static() ? relevant_dim.get_length() : Subgraph::DYNAMIC_DIMENSION); + for (int i = 0; i < num_inputs; i++) io_data_size.push_back(tile_begin->get_input_element_type(i).size()); - } - for (int i = 0; i < num_outputs; i++) { - // todo: we can take the whole partial shape in future, since it should contain only one dim - const auto& relevant_dim = tile_end->get_output_partial_shape(i)[dim_idx]; - io_dims.push_back(relevant_dim.is_static() ? relevant_dim.get_length() : Subgraph::DYNAMIC_DIMENSION); + for (int i = 0; i < num_outputs; i++) io_data_size.push_back(tile_end->get_input_element_type(i).size()); - } - size_t num_dynamic_inputs = 0; - const bool has_dynamic_dims = std::any_of(io_dims.begin(), io_dims.end(), [](size_t x) {return x == Subgraph::DYNAMIC_DIMENSION;}); - for (size_t i = 0; i < io_dims.size(); i ++) { - // If a last dim is static, but == 1 and there are some dynamic inputs as well, - // then treat the dim as dynamic, since we'll now whether it's broadcasted only at runtime - if (io_dims[i] == Subgraph::DYNAMIC_DIMENSION || (io_dims[i] == 1 && has_dynamic_dims)) { - dynamic_dims_idx.push_back(i); - if (i < num_inputs) - num_dynamic_inputs++; - } else { - static_dims_idx.push_back(i); - } - } - dynamic_increments.resize(dynamic_dims_idx.size()); - dynamic_broadcasting.resize(num_dynamic_inputs); in_out_type_ = emitter_in_out_map::gpr_to_gpr; } @@ -797,7 +345,6 @@ void TileEndEmitter::emit_impl(const std::vector& in, // restore reg state if we've changed it before h->pop(reg_work_amount); } -// cleanup_broadcasting(reg_const_params, data_ptr_regs); } BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index 01bed77257e6a9..37225fbb1c76bc 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -28,20 +28,10 @@ namespace intel_cpu { struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; - int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; - size_t scheduler_work_amounts[SNIPPETS_MAX_TILE_RANK] = {}; - int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {}; - float* broadcasting_scratchpad = nullptr; - bool broadcasting_mask[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; // bit is set if broadcasting over this io takes place - int64_t vector_tile_increments[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; - int64_t scalar_tile_increments[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; }; struct jit_snippets_compile_args { - bool is_static = true; std::vector master_shape{}; - int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; - size_t scheduler_work_amounts[SNIPPETS_MAX_TILE_RANK] = {}; int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {}; }; /// @@ -104,101 +94,6 @@ class KernelEmitter : public jit_container_emitter { std::vector gp_regs_used; std::vector vec_regs_pool; }; -/// -/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets -/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector -/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. -/// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs -/// \param in[2] The number of elements that fits into vector register -/// - -class TileSchedulerEmitter : public jit_container_emitter { -public: - TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} - void emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const override; - -private: - void validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const override; - void emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override; - - void emit_static_tiles(const Reg64&, const std::vector&, size_t, const std::vector& , const std::vector&) const; - - void emit_static_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const; - - void emit_dynamic_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const; - - jit_snippets_compile_args jcp; -}; - -/// -/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it performs operations specified by enclosed emitters, advances iteration counters -/// and breaks when necessary. -/// -/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. -class TileEmitter : public jit_container_emitter { -public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} - std::vector& get_nested_code(); - void emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const override; - - void emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const; - void emit_ptr_increments_static(const std::vector& data_ptr_regs) const; - void emit_ptr_increments_dynamic(const Reg64& reg_const_params, const std::vector& data_ptr_regs) const; - template - void set_increments_and_broadcast_inputs(const Reg64& reg_const_params, const std::vector &data_ptr_regs) const; - void cleanup_broadcasting(const Reg64& reg_const_params, const std::vector &data_ptr_regs) const; - -private: - void validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const override; - void emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override; - - size_t num_inputs = 0; - size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - size_t increment = 0; - std::vector static_dims_idx {}; // non-zero io_dims indexes == dims that are not broadcasted - std::vector dynamic_dims_idx {}; // non-zero io_dims indexes == dims that are not broadcasted - mutable std::vector