diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
index 315a93190fdc90..0ed17f36c21d22 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
@@ -57,13 +57,16 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
     auto reshape_const_m = wrap_type<ov::op::v0::Constant>();
     auto reshape_m = wrap_type<ov::op::v1::Reshape>({mul_m, reshape_const_m}, reshape_3d_to_2d);
 
+    auto mul2_const_m = wrap_type<ov::op::v0::Constant>();
+    auto mul2_m = wrap_type<ov::op::v1::Multiply>({reshape_m, mul2_const_m});
+
     auto transpose_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{reshape_m, mul_m});
     auto transpose_const_m = wrap_type<ov::op::v0::Constant>();
     auto transpose_m = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const_m});
 
     auto data_m = any_input();
     auto bias_m = any_input();
-    auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m, mul_m});
+    auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m, mul_m, mul2_m});
     auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m, bias_m});
 
     ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
@@ -131,6 +134,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
         std::shared_ptr<ov::Node> fc_input_zp = optional_zero_point;
         std::shared_ptr<ov::Node> fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr();
         std::vector<std::shared_ptr<ov::Node>> result_nodes = {};
+
         if (has_transpose) {
             const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr();
             std::shared_ptr<ov::Node> transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
@@ -151,6 +155,11 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
             }
         }
 
+        if (pattern_map.count(mul2_m)) {
+            auto mul2_op_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(mul2_const_m).get_node_shared_ptr());
+            fc_input_scale = ov::op::util::eltwise_fold<ov::op::v1::Multiply>(fc_input_scale, mul2_op_const).get_node_shared_ptr();
+        }
+
         std::shared_ptr<ov::Node> new_fc = nullptr;
         if (with_zero_point) {
             new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
@@ -171,6 +180,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
         new_fc->set_friendly_name(fc->get_friendly_name());
         ov::copy_runtime_info(m.get_matched_nodes(), result_nodes);
         ov::replace_node(fc, new_fc);
+
         return true;
     };
 
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index bfc348d135a813..158dee2ee7ac05 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -180,14 +180,29 @@ static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node
     if (all_has_types(consumers, { ov::op::v0::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
         return true;
 
+    auto are_multiply_from_decompression = [&all_has_types](const ov::Input<ov::Node> consumer) {
+        if (!cldnn::one_of(consumer.get_node()->get_type_info(), { ov::op::v1::Multiply::get_type_info_static() }))
+            return false;
+        const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
+        if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
+            return true;
+        return false;
+    };
-    auto are_converts_from_decompression = [&all_has_types](const std::set<ov::Input<ov::Node>>& consumers) {
+    auto are_converts_from_decompression = [&all_has_types, &are_multiply_from_decompression](const std::set<ov::Input<ov::Node>>& consumers) {
         if (!all_has_types(consumers, { ov::opset1::Convert::get_type_info_static() }))
             return false;
         for (const auto& consumer : consumers) {
             const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
-            if (!all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
+            for (const auto& child_consumer : child_consumers) {
+                const auto& type_info = child_consumer.get_node()->get_type_info();
+                if (cldnn::one_of(type_info, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
+                    continue;
+                if (are_multiply_from_decompression(child_consumer)) {
+                    continue;
+                }
                 return false;
+            }
         }
         return true;
     };
 
diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp
index fee6a02b6671d8..27c57aa072878d 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp
@@ -30,7 +30,7 @@ using ov::test::InputShape;
  *       \       /
  *        Multiply
  *           |
- *   Data(F32)   Transpose(optional)
+ *   Data(F32)   Transpose(optional) or Multiply(optional)
  *         \      /
  *          Matmul
  *            |
@@ -56,6 +56,7 @@ using MatmulWeightsDecompressionParams = std::tuple<...>;
@@ -70,6 +71,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface<MatmulWeightsDecompressionParams>
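For reviewers unfamiliar with the matched pattern, the sketch below (not part of this PR; all shapes, group sizes, and constant values are illustrative assumptions) builds the model-level subgraph the extended matcher now handles: grouped u8 weight decompression followed by a second constant Multiply after the 3D-to-2D Reshape, feeding a MatMul.

```cpp
// Illustrative only: the decompression subgraph targeted by this pass, built
// with the public OpenVINO graph API. Shapes and values are assumptions.
#include <memory>
#include <vector>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/subtract.hpp"

std::shared_ptr<ov::Model> make_decompressed_matmul() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 16});

    // Grouped-quantized weights: [O, groups, group_size] = [32, 4, 4].
    auto weights = ov::op::v0::Constant::create(ov::element::u8, {32, 4, 4},
                                                std::vector<uint8_t>(32 * 4 * 4, 1));
    auto convert = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);

    // Optional zero point and the per-group decompression scale.
    auto zp = ov::op::v0::Constant::create(ov::element::f32, {32, 4, 1}, {8.0f});
    auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp);
    auto scale = ov::op::v0::Constant::create(ov::element::f32, {32, 4, 1}, {0.01f});
    auto mul = std::make_shared<ov::op::v1::Multiply>(sub, scale);

    // 3D -> 2D reshape back to the [O, I] weight layout.
    auto shape_2d = ov::op::v0::Constant::create(ov::element::i32, {2}, {32, 16});
    auto reshape = std::make_shared<ov::op::v1::Reshape>(mul, shape_2d, false);

    // The case this PR adds: a second constant Multiply after the Reshape.
    // The pass folds its constant into the FC decompression scale.
    auto scale2 = ov::op::v0::Constant::create(ov::element::f32, {1}, {0.5f});
    auto mul2 = std::make_shared<ov::op::v1::Multiply>(reshape, scale2);

    // MatMul with transposed weights is later converted to op::FullyConnected,
    // which ConvertFullyConnectedToFullyConnectedCompressed then compresses.
    auto matmul = std::make_shared<ov::op::v0::MatMul>(data, mul2, false, true);
    return std::make_shared<ov::Model>(ov::OutputVector{matmul->output(0)}, ov::ParameterVector{data});
}
```

Because both scale factors are constants, the `eltwise_fold` call pre-multiplies them at transformation time, so the extra Multiply disappears from the graph and the resulting FullyConnectedCompressed op keeps a single decompression-scale input; the `is_decompression_multiply` change makes sure such a Multiply between the decompression Convert subgraph and the MatMul/Gather consumer is still recognized as part of the decompression pattern.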