[LIR] OptimizeLoopSingleEvaluation moved to a separate pass
v-Golubev committed Sep 29, 2023
1 parent 66d06d1 commit 3e052a2
Showing 8 changed files with 118 additions and 65 deletions.
30 changes: 30 additions & 0 deletions src/common/snippets/include/snippets/lowered/pass/optimize_loop_single_evaluation.hpp
@@ -0,0 +1,30 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
* @interface OptimizeLoopSingleEvaluation
* @brief Does the following optimizations if the Loop body can be executed only once:
* - sets evaluate_once parameter to true
* - moves all ptr arithmetic to finalization offsets
* @ingroup snippets
*/
class OptimizeLoopSingleEvaluation : public Pass {
public:
OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "Pass")
bool run(LinearIR& linear_ir) override;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
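
The header above only declares the pass. As a rough standalone sketch of the optimization it documents -- folding per-iteration pointer arithmetic into the finalization offsets once a loop body is known to execute exactly once -- the following compiles on its own; ToyLoop and fold_single_evaluation are hypothetical stand-ins for op::LoopEnd and the pass body, not the real snippets API.

// Standalone illustration only: ToyLoop is a hypothetical stand-in for op::LoopEnd.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct ToyLoop {
    size_t work_amount;
    size_t increment;
    std::vector<int64_t> ptr_increments;
    std::vector<int64_t> finalization_offsets;
    bool evaluate_once = false;
};

// Fold per-iteration pointer increments into finalization offsets
// when the body can only run once (work_amount < 2 * increment).
void fold_single_evaluation(ToyLoop& loop) {
    if (loop.work_amount >= 2 * loop.increment)
        return;
    const auto incr = static_cast<int64_t>(loop.increment);
    for (size_t i = 0; i < loop.finalization_offsets.size(); ++i)
        loop.finalization_offsets[i] += loop.ptr_increments[i] * incr;
    std::fill(loop.ptr_increments.begin(), loop.ptr_increments.end(), int64_t(0));
    loop.evaluate_once = true;
}

int main() {
    ToyLoop loop{16, 16, {1, 1, 1}, {0, 0, -16}};
    fold_single_evaluation(loop);
    std::cout << "evaluate_once=" << loop.evaluate_once << ", offsets:";
    for (auto o : loop.finalization_offsets)
        std::cout << ' ' << o;   // prints: 16 16 0
    std::cout << '\n';
}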
4 changes: 4 additions & 0 deletions src/common/snippets/src/generator.cpp
@@ -6,7 +6,9 @@

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/assign_registers.hpp"
#include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
#include "snippets/lowered/pass/insert_tail_loop.hpp"
#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"

#include "snippets/op/kernel.hpp"

@@ -27,6 +29,8 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons
lowered::pass::PassPipeline lowered_pipeline;
lowered_pipeline.register_pass<lowered::pass::AssignRegisters>(reg_type_mapper);
lowered_pipeline.register_pass<lowered::pass::InsertTailLoop>();
lowered_pipeline.register_pass<lowered::pass::CleanupLoopOffsets>();
lowered_pipeline.register_pass<lowered::pass::OptimizeLoopSingleEvaluation>();
lowered_pipeline.run(linear_ir);

linear_ir.init_emitters(target);
4 changes: 0 additions & 4 deletions src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp
@@ -42,10 +42,6 @@ bool CleanupLoopOffsets::run(LinearIR& linear_ir) {
per_port_connector_offset[loop_inputs[i]] = i;

const auto outer_increment = static_cast<int64_t>(outer_loop_end->get_increment());
const auto work_amount = outer_loop_end->get_work_amount();
if (work_amount % outer_increment != 0) {
continue;
}
auto outer_ptr_increments = outer_loop_end->get_ptr_increments();
const auto& outer_is_incremented = outer_loop_end->get_is_incremented();
const auto& outer_loop_inputs = next_expr_it->get()->get_input_port_connectors();
43 changes: 7 additions & 36 deletions src/common/snippets/src/lowered/pass/insert_tail_loop.cpp
@@ -333,28 +333,6 @@ void InsertTailLoop::tail_transformations(LinearIR& linear_ir,
}
}

bool InsertTailLoop::optimize_single_evaluation(const std::shared_ptr<op::LoopEnd>& loop) {
// *1* solo vector/tail loop + empty outer loop
// => skip increments (both counter & ptr) : set evaluate_once flag
// *2* solo vector/tail loop + non-empty outer loop
// => skip counter increments but perform ptr increments : set evaluate_once,
// and perform pointer increments through finalization offsets
// *3* vector loop(s) + one tail loop
// => vector as usual, tail depends on outer loop, see *1* and *2*
if (loop->get_work_amount() >= 2 * loop->get_increment())
return false;

auto new_finalization_offsets = loop->get_finalization_offsets();
const auto& ptr_increments = loop->get_ptr_increments();
const auto work_amount_incr = static_cast<int64_t>(loop->get_increment());
for (size_t i = 0; i < new_finalization_offsets.size(); i++) {
new_finalization_offsets[i] += ptr_increments[i] * work_amount_incr;
}
loop->set_finalization_offsets(new_finalization_offsets);
loop->set_evaluate_once(true);
return true;
}

bool InsertTailLoop::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::insertTailLoop")
const auto& loop_manager = linear_ir.get_loop_manager();
@@ -381,18 +359,6 @@ bool InsertTailLoop::run(LinearIR& linear_ir) {
const auto tail_size = work_amount % increment;
const auto need_tail = tail_size != 0;
const auto need_vector_loop = work_amount >= increment;
// Note, that finalization_offsets could be modified inside optimize_single_evaluation,
// so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail)
const auto tail_finalization_offsets = need_tail ? loop_end->get_finalization_offsets() : std::vector<int64_t>{};
// vector loops are required => Just copy the body, original loop is already a vector one
if (need_vector_loop) {
// Note that finalization offsets should be applied after the last iteration.
// So if there is a tail, then we should apply offsets after it, but not now.
if (need_tail)
loop_end->set_finalization_offsets(std::vector<int64_t>(tail_finalization_offsets.size(), 0));

optimize_single_evaluation(loop_end);
}

// tail is required => transform the body into a tail representation
// tail loop is fake loop because for tail we should calculate only
@@ -401,9 +367,14 @@ bool InsertTailLoop::run(LinearIR& linear_ir) {
const auto loop_begin = loop_end->get_loop_begin();
const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(loop_begin));
LinearIR::constExprIt tail_begin, tail_end;
const auto& finalization_offsets = loop_end->get_finalization_offsets();
const auto tail_loop_end = create_tail_loop(linear_ir, begin_it, std::next(expr_it), tail_begin, tail_end,
loop_end, need_vector_loop, tail_size, tail_finalization_offsets);
optimize_single_evaluation(tail_loop_end);
loop_end, need_vector_loop, tail_size, finalization_offsets);
if (need_vector_loop) {
// Note that finalization offsets should be applied after the last iteration.
// So if there is a tail, then we should apply offsets after it.
loop_end->set_finalization_offsets(std::vector<int64_t>(finalization_offsets.size(), 0));
}
// Skip new tail loop. Note: tail_end refs to the next expression after LoopEnd of tail
expr_it = std::prev(tail_end);
}
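
For orientation, a small arithmetic sketch of the vector/tail split handled above, with hypothetical numbers not taken from the tests: for work_amount = 70 and increment = 16, the vector loop covers 64 elements in four iterations and the tail covers the remaining 6; because a tail exists, the original finalization offsets belong to the tail loop and the vector LoopEnd gets all-zero offsets, since finalization offsets must only be applied after the very last iteration.

// Hypothetical numbers; illustrates the need_tail / need_vector_loop split above.
#include <cassert>
#include <cstddef>

int main() {
    const size_t work_amount = 70;
    const size_t increment = 16;

    const size_t tail_size = work_amount % increment;        // 6
    const bool need_tail = tail_size != 0;                    // true
    const bool need_vector_loop = work_amount >= increment;   // true (4 full iterations)

    assert(tail_size == 6);
    assert(need_tail && need_vector_loop);
    // With a tail present, the vector LoopEnd gets all-zero finalization offsets
    // and the original offsets are moved to the tail LoopEnd (see the diff above).
    return 0;
}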
53 changes: 53 additions & 0 deletions src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp
@@ -0,0 +1,53 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

bool OptimizeLoopSingleEvaluation::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::OptimizeLoopSingleEvaluation")
if (linear_ir.empty())
return false;

bool is_modified = false;
for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
if (auto loop_end = ov::as_type_ptr<op::LoopEnd>(expr_it->get()->get_node())) {
// *1* solo vector/tail loop + empty outer loop
// => skip increments (both counter & ptr) : set evaluate_once flag
// *2* solo vector/tail loop + non-empty outer loop
// => skip counter increments but perform ptr increments : set evaluate_once,
// and perform pointer increments through finalization offsets
// *3* vector loop(s) + one tail loop
// => vector as usual, tail depends on outer loop, see *1* and *2*
if (loop_end->get_work_amount() >= 2 * loop_end->get_increment())
continue;

auto new_finalization_offsets = loop_end->get_finalization_offsets();
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto work_amount_incr = static_cast<int64_t>(loop_end->get_increment());
for (size_t i = 0; i < new_finalization_offsets.size(); i++) {
new_finalization_offsets[i] += ptr_increments[i] * work_amount_incr;
}
loop_end->set_finalization_offsets(new_finalization_offsets);
loop_end->set_ptr_increments(std::vector<int64_t>(new_finalization_offsets.size(), 0));
loop_end->set_evaluate_once(true);
is_modified = true;
}
}
return is_modified;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
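
The early continue above is the whole qualification test: after InsertTailLoop has split vector and tail loops, any loop whose work amount is below two increments can only execute once. A minimal sketch of that predicate with hypothetical values:

// Hypothetical values only; mirrors the qualification check in the pass above.
#include <cstddef>
#include <cstdio>

static bool can_evaluate_once(size_t work_amount, size_t increment) {
    return work_amount < 2 * increment;
}

int main() {
    // solo vector loop: one full iteration    -> optimized
    std::printf("%d\n", can_evaluate_once(16, 16));   // 1
    // tail loop: a single partial iteration   -> optimized
    std::printf("%d\n", can_evaluate_once(6, 16));    // 1
    // regular vector loop: many iterations    -> left as is
    std::printf("%d\n", can_evaluate_once(256, 16));  // 0
}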

2 changes: 0 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
@@ -32,7 +32,6 @@
#include "snippets/lowered/pass/load_movebroadcast_to_broadcastload.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/propagate_layout.hpp"
#include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"
#include "snippets/lowered/pass/move_scalar_to_consumer.hpp"
#include "snippets/lowered/pass/move_result_out_of_loop.hpp"
@@ -686,7 +685,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,

lowered::pass::PassPipeline final_pipeline;
final_pipeline.register_pass<lowered::pass::PropagateLayout>();
final_pipeline.register_pass<lowered::pass::CleanupLoopOffsets>();
final_pipeline.run(linear_ir);

m_buffer_scratchpad = buffer_allocation_pass->get_scratchpad_size();
46 changes: 24 additions & 22 deletions src/common/snippets/tests/src/pass/lowered/loop.cpp
@@ -2,21 +2,20 @@
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>
#include "snippets/op/loop.hpp"

#include "subgraph_simple.hpp"
#include <gtest/gtest.h>

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/insert_tail_loop.hpp"
#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

#include "snippets/op/loop.hpp"

#include "subgraph_simple.hpp"

using Snippets_TailProcessingTransformation = ::testing::Test;
// [Inserted Loop number, [ptr_increments, final_offsets]
@@ -120,19 +119,20 @@ TEST(Snippets_TailProcessingTransformation, BlockedTail_OriginalPtrShifts) {
pass::PassPipeline pass_pipeline;
init_pipeline(pass_pipeline);
pass_pipeline.register_pass<pass::InsertTailLoop>();
pass_pipeline.register_pass<pass::OptimizeLoopSingleEvaluation>();
pass_pipeline.run(linear_ir);

// [Inserted Loop number, [ptr_increments, final_offsets]
std::map<size_t, std::pair<std::vector<int64_t>, std::vector<int64_t>>> reference;
reference[0] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[1] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, -16)}; // Blocked Inner
reference[0] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[1] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, -16)}; // Blocked Inner
reference[2] = { std::vector<int64_t>(3, 20), std::vector<int64_t>(3, -80)}; // Vector Blocked
reference[3] = { std::vector<int64_t>(3, 20), std::vector<int64_t>(3, 0)}; // Vector Outer

reference[4] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[5] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, -16)}; // Blocked Inner
reference[4] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[5] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, -16)}; // Blocked Inner
reference[6] = { std::vector<int64_t>(3, 20), std::vector<int64_t>(3, -40)}; // Tail Blocked
reference[7] = { std::vector<int64_t>(3, 20), std::vector<int64_t>(3, -320)}; // Tail Blocked
reference[7] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, -320)}; // Tail Blocked

validate(linear_ir, reference);
}
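
In the updated expectations above, only the ptr_increments of the evaluate-once loops change (1 becomes 0) while the finalization offsets keep their values: the removed InsertTailLoop helper already folded the offsets but left ptr_increments untouched, whereas the new pass additionally resets them to zero via set_ptr_increments. A tiny standalone check of that invariant, assuming an increment of 16 (which the +/-16 offsets suggest):

// Assumed values; checks that folding leaves offsets at 16 and increments at 0.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    const int64_t increment = 16;              // assumed vector length
    std::vector<int64_t> ptr_increments(3, 1);
    std::vector<int64_t> offsets(3, 0);
    for (size_t i = 0; i < offsets.size(); ++i)
        offsets[i] += ptr_increments[i] * increment;             // fold
    std::fill(ptr_increments.begin(), ptr_increments.end(), 0);  // new behaviour
    assert(offsets == std::vector<int64_t>(3, 16));        // "Vector Inner" row
    assert(ptr_increments == std::vector<int64_t>(3, 0));
    return 0;
}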
@@ -145,21 +145,23 @@ TEST(Snippets_TailProcessingTransformation, BlockedTail_CleanUpPtrShifts) {

pass::PassPipeline pass_pipeline;
init_pipeline(pass_pipeline);
pass_pipeline.register_pass<pass::CleanupLoopOffsets>();
pass_pipeline.register_pass<pass::InsertTailLoop>();
pass_pipeline.register_pass<pass::CleanupLoopOffsets>();
pass_pipeline.register_pass<pass::OptimizeLoopSingleEvaluation>();
pass_pipeline.run(linear_ir);

// [Inserted Loop number, [ptr_increments, final_offsets]
std::map<size_t, std::pair<std::vector<int64_t>, std::vector<int64_t>>> reference;
reference[0] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[1] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, 4)}; // Blocked Inner
reference[2] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, -80)}; // Vector Blocked
reference[3] = { std::vector<int64_t>(3, 20), std::vector<int64_t>(3, 0)}; // Vector Outer

reference[4] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[5] = { std::vector<int64_t>(3, 1), std::vector<int64_t>(3, 4)}; // Blocked Inner
reference[6] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, -40)}; // Tail Blocked
reference[7] = { std::vector<int64_t>(3, 20), std::vector<int64_t>(3, 40)}; // Tail Blocked
reference[0] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, 16)}; // Vector Inner
// TODO: fix behavior with LoopEnd's port connectors
reference[1] = { std::vector<int64_t>(3, 0), std::vector<int64_t>{4, 4, -16}}; // Blocked Inner
reference[2] = {std::vector<int64_t>{0, 0, 20}, std::vector<int64_t>(3, 0)}; // Vector Blocked
reference[3] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, 0)}; // Vector Outer

reference[4] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, 16)}; // Vector Inner
reference[5] = { std::vector<int64_t>(3, 0), std::vector<int64_t>{4, 4, -16}}; // Blocked Inner
reference[6] = { std::vector<int64_t>{0, 0, 20}, std::vector<int64_t>(3, 0)}; // Tail Blocked
reference[7] = { std::vector<int64_t>(3, 0), std::vector<int64_t>(3, 0)}; // Tail Blocked

validate(linear_ir, reference);
}
@@ -154,7 +154,6 @@ bool BrgemmBlocking::run(LinearIR& linear_ir) {
loop_info->work_amount = increment;
loop_end->set_work_amount(increment);
loop_end->set_finalization_offsets(std::vector<int64_t>(loop_end->get_finalization_offsets().size(), 0));
snippets::lowered::pass::InsertTailLoop::optimize_single_evaluation(loop_end);
const auto begin_it = linear_ir.find(linear_ir.get_expr_by_node(new_loop_end->get_loop_begin()));
const auto end_it = linear_ir.find(linear_ir.get_expr_by_node(new_loop_end));
snippets::lowered::pass::InsertTailLoop::propagate_updated_subtensor_through_loop(