[GPU] Use ngraph Transpose+MatMul fusion instead of custom code in layout optimizer (openvinotoolkit#25708)

### Details:
- Allow Transpose+MatMul+[Transpose] fusion for static shapes (a sketch of the target pattern follows this list)
- Allow Transpose in `MoveEltwiseUpThroughDataMov` for the specific Transpose -> Eltwise -> MatMul case
- Change the order of `MoveEltwiseUpThroughDataMov` and `ConvertMatMulToFullyConnected` to simplify the callback
- Remove the custom code for the same fusion from the layout optimizer, along with the related debug knob
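
For illustration only (not part of the commit): a minimal sketch of the subgraph shape this fusion targets, built with the OpenVINO C++ API. The shapes and the `{0, 1, 3, 2}` order are arbitrary examples; the order is taken from the whitelist introduced in `has_optimized_version` below.

```cpp
// Sketch: a static-shape Transpose -> MatMul subgraph that TransposeFusion can
// now fold into a single gemm op (previously only dynamic shapes matched).
#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/transpose.hpp"

std::shared_ptr<ov::Model> make_transpose_matmul_pattern() {
    auto input_a = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{1, 8, 64, 32});
    auto input_b = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{1, 8, 64, 32});
    // {0, 1, 3, 2} swaps the two innermost dims, so B is consumed transposed.
    auto order = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 1, 3, 2});
    auto transpose_b = std::make_shared<ov::op::v1::Transpose>(input_b, order);
    auto matmul = std::make_shared<ov::op::v0::MatMul>(input_a, transpose_b);
    return std::make_shared<ov::Model>(matmul, ov::ParameterVector{input_a, input_b});
}
```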
vladimir-paramuzov authored Aug 29, 2024
1 parent fcd1138 commit 1f9a6f8
Showing 9 changed files with 59 additions and 328 deletions.
@@ -105,7 +105,6 @@ class debug_configuration {
int print_input_data_shapes; // Print the input data_shape for benchmark_app.
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
- int disable_onednn_permute_fusion; // Disable permute fusion for onednn ops
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
int dump_profiling_data_per_iter; // Enables dump of extended performance profiling to specified dir for each iteration
@@ -41,10 +41,6 @@ void compile_graph::run(program& p) {
auto& proc_order = p.get_processing_order();
std::vector<ov::threading::Task> tasks;
std::exception_ptr exception;
- bool disable_permute_fuse_onednn_gemm = false;
- GPU_DEBUG_GET_INSTANCE(debug_config);
- GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1)
- disable_permute_fuse_onednn_gemm = true;

for (size_t idx = 0; idx < proc_order.size(); idx++) {
auto& node = *(std::next(proc_order.begin(), idx));
@@ -57,7 +53,7 @@ void compile_graph::run(program& p) {
// Do not change impl (i.e. do not use ocl shape-agnostic kernels)
// since oneDNN primitives/kernels caching mechanism will be used instead.
change_initial_impl = false;
- } else if (node->is_type<gemm>() && !disable_permute_fuse_onednn_gemm) {
+ } else if (node->is_type<gemm>()) {
// permute is fused to onednn gemm. The updated memory formats are not supported by ocl, so keep the onednn impl
for (const auto& dep : node->get_dependencies()) {
if (dep.first->is_type<permute>() && dep.first->can_be_optimized() && !dep.first->is_runtime_skippable() &&
63 changes: 0 additions & 63 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -2044,69 +2044,6 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(target_format) << " --> " << fmt_to_str(target_format)
<< " For index : " << idx << std::endl;
}
- bool disable_permute_fuse_onednn_gemm = false;
- GPU_DEBUG_GET_INSTANCE(debug_config);
- GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1)
- disable_permute_fuse_onednn_gemm = true;
- // Optimized out permute from permute-gemm pattern. i.e. permute -> gemm
- if (node.is_type<gemm>() && !disable_permute_fuse_onednn_gemm && node.get_program().get_config().get_property(ov::intel_gpu::optimize_data)) {
- // Only the formats below support permute opt out in gemm and permute pattern. For other formats, need to check the gemm performance.
- for (size_t idx = 0 ; idx < node.get_dependencies().size() ; idx++) {
- if (node.get_dependency(idx).is_type<permute>()) {
- auto& pnode = node.get_dependency(idx);
- if (pnode.has_fused_primitives()) {
- continue;
- }
- auto input_lay = pnode.get_dependency(0).get_output_layout();
- auto output_lay = pnode.get_output_layout();
- bool can_fuse_permute = input_lay.compatible(output_lay) ||
- ((input_lay.is_dynamic() || output_lay.is_dynamic()) &&
- format::is_default_format(input_lay.format) &&
- format::is_default_format(output_lay.format) && pnode.get_users().size() == 1);
- const auto& permute_order = pnode.get_kernel_impl_params()->typed_desc<permute>()->permute_order;
- std::vector<size_t> order(std::begin(permute_order), std::end(permute_order));
- format fmt = format::bfyx;
- if (can_fuse_permute && gemm_inst::is_fusable_permute_input_order_onednn(order, fmt)) {
- pnode.init_preferred_fmt(1, 1);
- pnode.set_preferred_output_fmt(0, format(static_cast<format::type>(fmt)));
- pnode.can_be_optimized(true);
- node.set_preferred_input_fmt(idx, format(static_cast<format::type>(fmt)));
- GPU_DEBUG_TRACE_DETAIL << pnode.id() << " is fused to onednn gemm user : " << node.id() << std::endl;
- GPU_DEBUG_TRACE_DETAIL << " permute order : ";
- GPU_DEBUG_CODE(for (const auto& o : permute_order) GPU_DEBUG_TRACE_DETAIL << o << " "; GPU_DEBUG_TRACE_DETAIL << std::endl;)
- }
- }
- }
- // gemm -> permute
- if (node.get_users().size() == 1 && node.get_users().front()->is_type<permute>() && !node.has_fused_primitives()) {
- auto& pnode = node.get_users().front()->as<permute>();
- if (!pnode.has_fused_primitives()) {
- auto input_lay = pnode.get_dependency(0).get_output_layout();
- auto output_lay = pnode.get_output_layout();
- bool can_fuse_permute = input_lay.compatible(output_lay) ||
- ((input_lay.is_dynamic() || output_lay.is_dynamic()) &&
- format::is_default_format(input_lay.format) &&
- format::is_default_format(output_lay.format) && pnode.get_users().size() == 1);
- format fmt = format::bfyx;
- auto impl_param = pnode.get_kernel_impl_params();
- auto desc = impl_param->typed_desc<permute>();
- auto permute_order = desc->permute_order;
- std::vector<size_t> order(std::begin(permute_order), std::end(permute_order));
- if (can_fuse_permute && gemm_inst::is_fusable_permute_output_order_onednn(order, fmt)) {
- node.set_preferred_output_fmt(0, format(static_cast<format::type>(fmt)));
- pnode.init_preferred_fmt(1, 1);
- pnode.set_preferred_input_fmt(0, format(static_cast<format::type>(fmt)));
- // tmp :: to fix
- format out_fmt = format::bfyx;
- pnode.set_preferred_output_fmt(0, format(static_cast<format::type>(out_fmt)));
- pnode.can_be_optimized(true);
- GPU_DEBUG_TRACE_DETAIL << pnode.id() << " is fused to onednn gemm pred : " << node.id() << std::endl;
- GPU_DEBUG_TRACE_DETAIL << " permute order : ";
- GPU_DEBUG_CODE(for (const auto& o : permute_order) GPU_DEBUG_TRACE_DETAIL << o << " "; GPU_DEBUG_TRACE_DETAIL << std::endl;)
- }
- }
- }
- }
}
}
#endif // ENABLE_ONEDNN_FOR_GPU
@@ -4,6 +4,7 @@

#include "intel_gpu/op/gemm.hpp"
#include "intel_gpu/op/sdpa.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "openvino/core/node_vector.hpp"
#include "openvino/core/partial_shape.hpp"
#include "openvino/core/type/element_type.hpp"
@@ -27,9 +28,40 @@ using ov::pass::pattern::op::Or;
namespace ov {
namespace intel_gpu {

- TransposeFusion::TransposeFusion() {
- add_matcher<TransposeMatMulTransposeMatcher>();
- add_matcher<TransposeMatMulMatcher>();
+ namespace {
+
+ bool has_optimized_version(const ov::Output<ov::Node>& output, bool supports_immad) {
+ if (!output.get_element_type().is_real())
+ return false;
+
+ if (output.get_partial_shape().is_static() && !supports_immad)
+ return false;
+
+ auto order_node = output.get_node()->get_input_node_shared_ptr(1);
+ if (!ov::is_type<ov::op::v0::Constant>(order_node))
+ return false;
+
+ auto transpose_order = std::dynamic_pointer_cast<ov::op::v0::Constant>(order_node)->cast_vector<int64_t>();
+ static const std::vector<std::vector<int64_t>> allowed_orders = {
+ {0, 1, 2, 3},
+ {0, 1, 3, 2},
+ {0, 2, 1, 3},
+ {0, 3, 1, 2},
+ {1, 2, 0, 3},
+ {2, 0, 1, 3},
+ {3, 0, 1, 2},
+ };
+
+ if (!cldnn::one_of(transpose_order, allowed_orders))
+ return false;
+
+ return true;
+ }
+ } // namespace
+
+ TransposeFusion::TransposeFusion(bool supports_immad) {
+ add_matcher<TransposeMatMulTransposeMatcher>(supports_immad);
+ add_matcher<TransposeMatMulMatcher>(supports_immad);
add_matcher<TransposeSDPAMatcher>();
}

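An aside on the helper above: `has_optimized_version` reduces to a whitelist test on the transpose order (plus the element-type and static-shape/immad checks). A self-contained sketch of the same membership test, assuming `cldnn::one_of` behaves as a plain containment helper:

```cpp
// Illustration: checking a transpose order against the whitelist above.
// std::vector::operator== compares element-wise, which is all the test needs.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::vector<std::vector<int64_t>> allowed_orders = {
        {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {0, 3, 1, 2},
        {1, 2, 0, 3}, {2, 0, 1, 3}, {3, 0, 1, 2},
    };
    // {0, 2, 1, 3} is the usual attention-style transpose:
    // [batch, seq, heads, head_size] -> [batch, heads, seq, head_size].
    const std::vector<int64_t> order{0, 2, 1, 3};
    const bool supported =
        std::find(allowed_orders.begin(), allowed_orders.end(), order) != allowed_orders.end();
    std::cout << "order supported: " << std::boolalpha << supported << "\n";  // true
}
```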
@@ -150,37 +182,26 @@ TransposeSDPAMatcher::TransposeSDPAMatcher() {
this->register_matcher(m, callback);
}

- TransposeMatMulMatcher::TransposeMatMulMatcher() {
- auto is_fp_type = [](const ov::Output<ov::Node>& output) -> bool {
- switch (output.get_element_type()) {
- case ov::element::f16:
- case ov::element::f32: return true;
- default: return false;
- }
- };
- auto not_transpose = [is_fp_type](const ov::Output<ov::Node>& output) -> bool {
+ TransposeMatMulMatcher::TransposeMatMulMatcher(bool supports_immad) {
+ auto not_transpose = [](const ov::Output<ov::Node>& output) -> bool {
return std::dynamic_pointer_cast<ov::op::v1::Transpose>(output.get_node_shared_ptr()) == nullptr
&& is_fp_type(output);
&& output.get_element_type().is_real();
};
- auto is_dynamic = [](const ov::Output<ov::Node>& output) -> bool {
- bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic();
- size_t num_inputs = output.get_node_shared_ptr()->get_input_size();
- for (size_t idx = 0; idx < num_inputs; idx++) {
- is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic();
- }
- return is_dynamic;
+
+ auto transpose_predicate = [supports_immad](const ov::Output<ov::Node>& output) -> bool {
+ return has_optimized_version(output, supports_immad);
};
auto input_a_m = any_input(not_transpose);
auto input_b_m = any_input(not_transpose);
auto transpose_a_order_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
auto transpose_b_order_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
- auto transpose_a_m = wrap_type<ov::op::v1::Transpose>({input_a_m, transpose_a_order_m}, is_fp_type);
- auto transpose_b_m = wrap_type<ov::op::v1::Transpose>({input_b_m, transpose_b_order_m}, is_fp_type);
+ auto transpose_a_m = wrap_type<ov::op::v1::Transpose>({input_a_m, transpose_a_order_m}, transpose_predicate);
+ auto transpose_b_m = wrap_type<ov::op::v1::Transpose>({input_b_m, transpose_b_order_m}, transpose_predicate);

auto matmul_in_a = std::make_shared<Or>(OutputVector{input_a_m, transpose_a_m});
auto matmul_in_b = std::make_shared<Or>(OutputVector{input_b_m, transpose_b_m});

- auto matmul_m = wrap_type<ov::op::v0::MatMul>({ matmul_in_a, matmul_in_b }, is_dynamic);
+ auto matmul_m = wrap_type<ov::op::v0::MatMul>({ matmul_in_a, matmul_in_b });

ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) {
const auto& pattern_map = m.get_pattern_value_map();
@@ -234,39 +255,27 @@ TransposeMatMulMatcher::TransposeMatMulMatcher() {
this->register_matcher(m, callback);
}

- TransposeMatMulTransposeMatcher::TransposeMatMulTransposeMatcher() {
- auto is_fp_type = [](const ov::Output<ov::Node>& output) -> bool {
- switch (output.get_element_type()) {
- case ov::element::f16:
- case ov::element::f32: return true;
- default: return false;
- }
- };
- auto not_transpose = [is_fp_type](const ov::Output<ov::Node>& output) -> bool {
+ TransposeMatMulTransposeMatcher::TransposeMatMulTransposeMatcher(bool supports_immad) {
+ auto not_transpose = [](const ov::Output<ov::Node>& output) -> bool {
return std::dynamic_pointer_cast<ov::op::v1::Transpose>(output.get_node_shared_ptr()) == nullptr
&& is_fp_type(output);
&& output.get_element_type().is_real();
};
- auto is_dynamic = [](const ov::Output<ov::Node>& output) -> bool {
- bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic();
- size_t num_inputs = output.get_node_shared_ptr()->get_input_size();
- for (size_t idx = 0; idx < num_inputs; idx++) {
- is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic();
- }
- return is_dynamic;
+ auto transpose_predicate = [supports_immad](const ov::Output<ov::Node>& output) -> bool {
+ return has_optimized_version(output, supports_immad);
};
auto input_a_m = any_input(not_transpose);
auto input_b_m = any_input(not_transpose);
auto transpose_a_order_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
auto transpose_b_order_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
- auto transpose_a_m = wrap_type<ov::op::v1::Transpose>({input_a_m, transpose_a_order_m}, is_fp_type);
- auto transpose_b_m = wrap_type<ov::op::v1::Transpose>({input_b_m, transpose_b_order_m}, is_fp_type);
+ auto transpose_a_m = wrap_type<ov::op::v1::Transpose>({input_a_m, transpose_a_order_m}, transpose_predicate);
+ auto transpose_b_m = wrap_type<ov::op::v1::Transpose>({input_b_m, transpose_b_order_m}, transpose_predicate);

auto matmul_in_a = std::make_shared<Or>(OutputVector{input_a_m, transpose_a_m});
auto matmul_in_b = std::make_shared<Or>(OutputVector{input_b_m, transpose_b_m});

- auto matmul_m = wrap_type<ov::op::v0::MatMul>({ matmul_in_a, matmul_in_b }, is_dynamic);
+ auto matmul_m = wrap_type<ov::op::v0::MatMul>({ matmul_in_a, matmul_in_b });
auto transpose_c_order_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
- auto transpose_c_m = wrap_type<ov::op::v1::Transpose>({matmul_m, transpose_c_order_m});
+ auto transpose_c_m = wrap_type<ov::op::v1::Transpose>({matmul_m, transpose_c_order_m}, transpose_predicate);

ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) {
const auto& pattern_map = m.get_pattern_value_map();
@@ -12,19 +12,19 @@ namespace intel_gpu {
class TransposeFusion: public ov::pass::GraphRewrite {
public:
OPENVINO_RTTI("TransposeFusion", "0");
- TransposeFusion();
+ TransposeFusion(bool supports_immad = false);
};

class TransposeMatMulMatcher : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("TransposeMatMulMatcher", "0");
- TransposeMatMulMatcher();
+ TransposeMatMulMatcher(bool supports_immad);
};

class TransposeMatMulTransposeMatcher : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("TransposeMatMulTransposeMatcher", "0");
- TransposeMatMulTransposeMatcher();
+ TransposeMatMulTransposeMatcher(bool supports_immad);
};

class TransposeSDPAMatcher : public ov::pass::MatcherPass {
@@ -832,7 +832,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::pass::RMSFusion>();
manager.register_pass<ov::intel_gpu::KVCacheFusion>();
manager.register_pass<ov::intel_gpu::FullyConnectedConvertFusion>();
manager.register_pass<ov::intel_gpu::TransposeFusion>();
manager.register_pass<ov::intel_gpu::TransposeFusion>(device_info.supports_immad);

if (!device_info.supports_immad) {
manager.register_pass<ov::intel_gpu::UnsqueezeBroadcastReshapeMatmulFusion>();
3 changes: 0 additions & 3 deletions src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -133,7 +133,6 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_PrintInputDataShapes", "Print data_shapes of input layers for benchmark_app.");
message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
message_list.emplace_back("OV_GPU_DisableOnednnPermuteFusion", "Disable permute fusion for onednn gemm (no effect for integrated GPU)");
message_list.emplace_back("OV_GPU_DisableOnednnOptPostOps", "Disable onednn optimize post operators");
message_list.emplace_back("OV_GPU_DumpProfilingData", "Enables dump of extended profiling information to specified directory."
" Please use OV_GPU_DumpProfilingDataPerIter=1 env variable to collect performance per iteration."
@@ -220,7 +219,6 @@ debug_configuration::debug_configuration()
, print_input_data_shapes(0)
, disable_usm(0)
, disable_onednn(0)
- , disable_onednn_permute_fusion(0)
, disable_onednn_opt_post_ops(0)
, dump_profiling_data(std::string(""))
, dump_profiling_data_per_iter(0)
@@ -271,7 +269,6 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DumpLayersResult", dump_layers_result);
get_gpu_debug_env_var("DumpLayersInput", dump_layers_input);
get_gpu_debug_env_var("DisableOnednn", disable_onednn);
get_gpu_debug_env_var("DisableOnednnPermuteFusion", disable_onednn_permute_fusion);
get_gpu_debug_env_var("DisableOnednnOptPostOps", disable_onednn_opt_post_ops);
get_gpu_debug_env_var("DumpProfilingData", dump_profiling_data);
get_gpu_debug_env_var("DumpProfilingDataPerIter", dump_profiling_data_per_iter);