diff --git a/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp b/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp new file mode 100644 index 00000000000000..9ac4181e61e861 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +/** + * @interface OptimizeLoopSingleEvaluation + * @brief Does the following optimizations if the Loop body can be executed only once: + * - sets evaluate_once parameter to true + * - moves all ptr arithmetic to finalization offsets + * @ingroup snippets + */ +class OptimizeLoopSingleEvaluation : public Pass { +public: + OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "Pass") + bool run(LinearIR& linear_ir) override; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 1d1d733277f99b..c8386ffe8e2f90 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -6,7 +6,9 @@ #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/assign_registers.hpp" +#include "snippets/lowered/pass/cleanup_loop_offsets.hpp" #include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" #include "snippets/op/kernel.hpp" @@ -27,6 +29,8 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons lowered::pass::PassPipeline lowered_pipeline; lowered_pipeline.register_pass(reg_type_mapper); lowered_pipeline.register_pass(); + lowered_pipeline.register_pass(); + lowered_pipeline.register_pass(); lowered_pipeline.run(linear_ir); linear_ir.init_emitters(target); diff --git a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp index 177b4c66eb6e70..775a48bad1893e 100644 --- a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp +++ b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp @@ -42,10 +42,6 @@ bool CleanupLoopOffsets::run(LinearIR& linear_ir) { per_port_connector_offset[loop_inputs[i]] = i; const auto outer_increment = static_cast(outer_loop_end->get_increment()); - const auto work_amount = outer_loop_end->get_work_amount(); - if (work_amount % outer_increment != 0) { - continue; - } auto outer_ptr_increments = outer_loop_end->get_ptr_increments(); const auto& outer_is_incremented = outer_loop_end->get_is_incremented(); const auto& outer_loop_inputs = next_expr_it->get()->get_input_port_connectors(); diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp index e9c7495f823fe2..8bf7929d82006c 100644 --- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp +++ b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp @@ -333,28 +333,6 @@ void InsertTailLoop::tail_transformations(LinearIR& linear_ir, } } -bool InsertTailLoop::optimize_single_evaluation(const std::shared_ptr& loop) { - // *1* solo vector/tail loop + empty outer loop - // => skip increments (both counter & ptr) : set evaluate_once flag - // *2* solo vector/tail loop + non-empty outer loop - // => skip counter increments but perform ptr increments : set evaluate_once, - // and perform pointer increments through finalization offsets - // *3* vector loop(s) + one tail loop - // => vector as usual, tail depends on outer loop, see *1* and *2* - if (loop->get_work_amount() >= 2 * loop->get_increment()) - return false; - - auto new_finalization_offsets = loop->get_finalization_offsets(); - const auto& ptr_increments = loop->get_ptr_increments(); - const auto work_amount_incr = static_cast(loop->get_increment()); - for (size_t i = 0; i < new_finalization_offsets.size(); i++) { - new_finalization_offsets[i] += ptr_increments[i] * work_amount_incr; - } - loop->set_finalization_offsets(new_finalization_offsets); - loop->set_evaluate_once(true); - return true; -} - bool InsertTailLoop::run(LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::insertTailLoop") const auto& loop_manager = linear_ir.get_loop_manager(); @@ -381,18 +359,6 @@ bool InsertTailLoop::run(LinearIR& linear_ir) { const auto tail_size = work_amount % increment; const auto need_tail = tail_size != 0; const auto need_vector_loop = work_amount >= increment; - // Note, that finalization_offsets could be modified inside optimize_single_evaluation, - // so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail) - const auto tail_finalization_offsets = need_tail ? loop_end->get_finalization_offsets() : std::vector{}; - // vector loops are required => Just copy the body, original loop is already a vector one - if (need_vector_loop) { - // Note that finalization offsets should be applied after the last iteration. - // So if there is a tail, then we should apply offsets after it, but not now. - if (need_tail) - loop_end->set_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); - - optimize_single_evaluation(loop_end); - } // tail is required => transform the body into a tail representation // tail loop is fake loop because for tail we should calculate only @@ -401,9 +367,14 @@ bool InsertTailLoop::run(LinearIR& linear_ir) { const auto loop_begin = loop_end->get_loop_begin(); const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_begin)); LinearIR::constExprIt tail_begin, tail_end; + const auto& finalization_offsets = loop_end->get_finalization_offsets(); const auto tail_loop_end = create_tail_loop(linear_ir, begin_it, std::next(expr_it), tail_begin, tail_end, - loop_end, need_vector_loop, tail_size, tail_finalization_offsets); - optimize_single_evaluation(tail_loop_end); + loop_end, need_vector_loop, tail_size, finalization_offsets); + if (need_vector_loop) { + // Note that finalization offsets should be applied after the last iteration. + // So if there is a tail, then we should apply offsets after it. + loop_end->set_finalization_offsets(std::vector(finalization_offsets.size(), 0)); + } // Skip new tail loop. Note: tail_end refs to the next expression after LoopEnd of tail expr_it = std::prev(tail_end); } diff --git a/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp new file mode 100644 index 00000000000000..4244c09c7e658c --- /dev/null +++ b/src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +bool OptimizeLoopSingleEvaluation::run(LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::OptimizeLoopSingleEvaluation") + if (linear_ir.empty()) + return false; + + bool is_modified = false; + for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { + if (auto loop_end = ov::as_type_ptr(expr_it->get()->get_node())) { + // *1* solo vector/tail loop + empty outer loop + // => skip increments (both counter & ptr) : set evaluate_once flag + // *2* solo vector/tail loop + non-empty outer loop + // => skip counter increments but perform ptr increments : set evaluate_once, + // and perform pointer increments through finalization offsets + // *3* vector loop(s) + one tail loop + // => vector as usual, tail depends on outer loop, see *1* and *2* + if (loop_end->get_work_amount() >= 2 * loop_end->get_increment()) + continue; + + auto new_finalization_offsets = loop_end->get_finalization_offsets(); + const auto& ptr_increments = loop_end->get_ptr_increments(); + const auto work_amount_incr = static_cast(loop_end->get_increment()); + for (size_t i = 0; i < new_finalization_offsets.size(); i++) { + new_finalization_offsets[i] += ptr_increments[i] * work_amount_incr; + } + loop_end->set_finalization_offsets(new_finalization_offsets); + loop_end->set_ptr_increments(std::vector(new_finalization_offsets.size(), 0)); + loop_end->set_evaluate_once(true); + is_modified = true; + } + } + return is_modified; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov + diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 83a200df2e8a5c..bbd55ebb29788a 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -32,7 +32,6 @@ #include "snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp" #include "snippets/lowered/pass/allocate_buffers.hpp" #include "snippets/lowered/pass/propagate_layout.hpp" -#include "snippets/lowered/pass/cleanup_loop_offsets.hpp" #include "snippets/lowered/pass/softmax_decomposition.hpp" #include "snippets/lowered/pass/move_scalar_to_consumer.hpp" #include "snippets/lowered/pass/move_result_out_of_loop.hpp" @@ -686,7 +685,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, lowered::pass::PassPipeline final_pipeline; final_pipeline.register_pass(); - final_pipeline.register_pass(); final_pipeline.run(linear_ir); m_buffer_scratchpad = buffer_allocation_pass->get_scratchpad_size(); diff --git a/src/common/snippets/tests/src/pass/lowered/loop.cpp b/src/common/snippets/tests/src/pass/lowered/loop.cpp index 60091bc9f81e7b..15865690255d30 100644 --- a/src/common/snippets/tests/src/pass/lowered/loop.cpp +++ b/src/common/snippets/tests/src/pass/lowered/loop.cpp @@ -2,21 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "snippets/op/loop.hpp" -#include "subgraph_simple.hpp" +#include #include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/cleanup_loop_offsets.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" -#include "snippets/lowered/pass/cleanup_loop_offsets.hpp" -#include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/insert_tail_loop.hpp" +#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp" +#include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/shape_inference/shape_inference.hpp" - -#include "snippets/op/loop.hpp" - +#include "subgraph_simple.hpp" using Snippets_TailProcessingTransformation = ::testing::Test; // [Inserted Loop number, [ptr_increments, final_offsets] @@ -120,19 +119,20 @@ TEST(Snippets_TailProcessingTransformation, BlockedTail_OriginalPtrShifts) { pass::PassPipeline pass_pipeline; init_pipeline(pass_pipeline); pass_pipeline.register_pass(); + pass_pipeline.register_pass(); pass_pipeline.run(linear_ir); // [Inserted Loop number, [ptr_increments, final_offsets] std::map, std::vector>> reference; - reference[0] = { std::vector(3, 1), std::vector(3, 16)}; // Vector Inner - reference[1] = { std::vector(3, 1), std::vector(3, -16)}; // Blocked Inner + reference[0] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner + reference[1] = { std::vector(3, 0), std::vector(3, -16)}; // Blocked Inner reference[2] = { std::vector(3, 20), std::vector(3, -80)}; // Vector Blocked reference[3] = { std::vector(3, 20), std::vector(3, 0)}; // Vector Outer - reference[4] = { std::vector(3, 1), std::vector(3, 16)}; // Vector Inner - reference[5] = { std::vector(3, 1), std::vector(3, -16)}; // Blocked Inner + reference[4] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner + reference[5] = { std::vector(3, 0), std::vector(3, -16)}; // Blocked Inner reference[6] = { std::vector(3, 20), std::vector(3, -40)}; // Tail Blocked - reference[7] = { std::vector(3, 20), std::vector(3, -320)}; // Tail Blocked + reference[7] = { std::vector(3, 0), std::vector(3, -320)}; // Tail Blocked validate(linear_ir, reference); } @@ -145,21 +145,23 @@ TEST(Snippets_TailProcessingTransformation, BlockedTail_CleanUpPtrShifts) { pass::PassPipeline pass_pipeline; init_pipeline(pass_pipeline); - pass_pipeline.register_pass(); pass_pipeline.register_pass(); + pass_pipeline.register_pass(); + pass_pipeline.register_pass(); pass_pipeline.run(linear_ir); // [Inserted Loop number, [ptr_increments, final_offsets] std::map, std::vector>> reference; - reference[0] = { std::vector(3, 1), std::vector(3, 16)}; // Vector Inner - reference[1] = { std::vector(3, 1), std::vector(3, 4)}; // Blocked Inner - reference[2] = { std::vector(3, 0), std::vector(3, -80)}; // Vector Blocked - reference[3] = { std::vector(3, 20), std::vector(3, 0)}; // Vector Outer - - reference[4] = { std::vector(3, 1), std::vector(3, 16)}; // Vector Inner - reference[5] = { std::vector(3, 1), std::vector(3, 4)}; // Blocked Inner - reference[6] = { std::vector(3, 0), std::vector(3, -40)}; // Tail Blocked - reference[7] = { std::vector(3, 20), std::vector(3, 40)}; // Tail Blocked + reference[0] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner + // TODO: fix behavior with LoopEnd's port connectors + reference[1] = { std::vector(3, 0), std::vector{4, 4, -16}}; // Blocked Inner + reference[2] = {std::vector{0, 0, 20}, std::vector(3, 0)}; // Vector Blocked + reference[3] = { std::vector(3, 0), std::vector(3, 0)}; // Vector Outer + + reference[4] = { std::vector(3, 0), std::vector(3, 16)}; // Vector Inner + reference[5] = { std::vector(3, 0), std::vector{4, 4, -16}}; // Blocked Inner + reference[6] = { std::vector{0, 0, 20}, std::vector(3, 0)}; // Tail Blocked + reference[7] = { std::vector(3, 0), std::vector(3, 0)}; // Tail Blocked validate(linear_ir, reference); } \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index 4b1d459c1c08f3..2591c28ab41fc8 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -154,7 +154,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) { loop_info->work_amount = increment; loop_end->set_work_amount(increment); loop_end->set_finalization_offsets(std::vector(loop_end->get_finalization_offsets().size(), 0)); - snippets::lowered::pass::InsertTailLoop::optimize_single_evaluation(loop_end); const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(new_loop_end->get_loop_begin())); const auto end_it = linear_ir.find(linear_ir.get_expr_by_node(new_loop_end)); snippets::lowered::pass::InsertTailLoop::propagate_updated_subtensor_through_loop(