From 96ecfc096330d0472ceb283423569371b884a7b9 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Wed, 10 Jul 2024 10:41:01 +0800
Subject: [PATCH 1/9] Filter out small weights for GatherCompressed.

Reasons:
1: Small weights have little impact on compile_model performance;
2: There is a high probability that constant folding will not be performed
   during compile_model.

Signed-off-by: xipingya
---
 .../op_conversions/convert_gather_to_compressed.cpp | 4 +++-
 .../src/transformations/transformation_pipeline.cpp | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 3d1ad8ff2b3b6a..156481fb893227 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
                                                              gather_input_scale);
     }
 
-    transformation_callback(new_gather_node);
+    if (transformation_callback(new_gather_node)) {
+        return false;
+    }
 
     result_nodes.push_back(new_gather_node);
     new_gather_node->set_friendly_name(gather_node->get_friendly_name());
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index bd3397245f5a26..55a52cd880ecf1 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -328,6 +328,15 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+
+               // Heuristic:
+               // 1: Small weights have little impact on compile_model performance;
+               // 2: There is a high probability that constant folding will not be performed during compile_model;
+               const auto& input_partial_shape = node->get_input_partial_shape(0);
+               const auto& rank = input_partial_shape.rank();
+               if (rank.is_static() && (rank.get_length() == 2)) {
+                   return ov::shape_size(input_partial_shape.get_shape()) < 256u * 512u;
+               }
            }
            return false;
        },
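Read on its own, the heuristic above amounts to the following standalone predicate — an illustrative sketch only, with a hypothetical helper name that is not part of the patch; it mirrors the 2-D, 256u * 512u element threshold used in the callback:

    #include <memory>
    #include "openvino/core/node.hpp"
    #include "openvino/core/shape.hpp"

    // A weight constant is "small" when it is a static 2-D tensor with fewer
    // than 256 * 512 elements; such weights barely affect compile_model time
    // and are unlikely to be constant-folded anyway.
    static bool is_small_2d_weight(const std::shared_ptr<ov::Node>& weight) {
        const auto& pshape = weight->get_output_partial_shape(0);
        const auto& rank = pshape.rank();
        if (rank.is_static() && rank.get_length() == 2 && pshape.is_static())
            return ov::shape_size(pshape.get_shape()) < 256u * 512u;
        return false;
    }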
From 1420794cec8bb2a2d3c33e2618cfe4e31dcee2e1 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Wed, 10 Jul 2024 14:29:34 +0800
Subject: [PATCH 2/9] ov::pass::ConvertGatherToGatherCompressed breaks the
 original structure of the model; enable it only when useLpt == false.

Signed-off-by: xipingya
---
 .../convert_gather_to_compressed.cpp            |  4 +---
 .../transformations/transformation_pipeline.cpp | 15 ++++-----------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 156481fb893227..3d1ad8ff2b3b6a 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -134,9 +134,7 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
                                                              gather_input_scale);
     }
 
-    if (transformation_callback(new_gather_node)) {
-        return false;
-    }
+    transformation_callback(new_gather_node);
 
     result_nodes.push_back(new_gather_node);
     new_gather_node->set_friendly_name(gather_node->get_friendly_name());
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 55a52cd880ecf1..4a4d0b270f18db 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -308,7 +308,10 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     ov::pass::Manager decompression_handling_manager;
     decompression_handling_manager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
-    CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
+    const bool useLpt = !defaultPrecisions.empty();
+    if (!useLpt) {
+        CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
+    }
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
@@ -328,15 +331,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
-
-               // Heuristic:
-               // 1: Small weights have little impact on compile_model performance;
-               // 2: There is a high probability that constant folding will not be performed during compile_model;
-               const auto& input_partial_shape = node->get_input_partial_shape(0);
-               const auto& rank = input_partial_shape.rank();
-               if (rank.is_static() && (rank.get_length() == 2)) {
-                   return ov::shape_size(input_partial_shape.get_shape()) < 256u * 512u;
-               }
            }
            return false;
        },
@@ -345,7 +339,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
 
     ov::pass::Manager manager;
     manager.set_per_pass_validation(false);
-    const bool useLpt = !defaultPrecisions.empty();
     if (useLpt)
         CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions);
 

From 8df41d4ec81147a57c01592f21fbb2f5b15b2a04 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 12 Jul 2024 09:59:10 +0000
Subject: [PATCH 3/9] Update: 1: Add test; 2: When u8/i8 + useLpt, disable
 "ConvertGatherToGatherCompressed".
Signed-off-by: xipingya
---
 .../convert_gather_to_compressed.cpp          |   4 +-
 .../transformation_pipeline.cpp               |  12 +-
 ...sable_gathercompressed_quantized_model.cpp | 130 ++++++++++++++++++
 3 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 3d1ad8ff2b3b6a..156481fb893227 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
                                                              gather_input_scale);
     }
 
-    transformation_callback(new_gather_node);
+    if (transformation_callback(new_gather_node)) {
+        return false;
+    }
 
     result_nodes.push_back(new_gather_node);
     new_gather_node->set_friendly_name(gather_node->get_friendly_name());
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index f17fe8adadb347..b46ff64fbe160f 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -311,9 +311,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     decompression_handling_manager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
     const bool useLpt = !defaultPrecisions.empty();
-    if (!useLpt) {
-        CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
-    }
+    CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
@@ -333,6 +331,14 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+               if (std::getenv("WITH_PR")) {
+                   if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                                             ov::element::u8,
+                                             ov::element::i8) &&
+                       useLpt) {
+                       return true;
+                   }
+               }
            }
            return false;
        },
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
new file mode 100644
index 00000000000000..9d2ad390642206
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -0,0 +1,130 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/data_utils.hpp"
+#include "openvino/runtime/exec_model_info.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace ov {
+namespace test {
+/*
+ *                        input2
+ *                           |
+ *    Constant(i8)        Softmax
+ *         |              /
+ *      Convert      Multiply
+ *         |         /
+ *      Multiply   Convert      input1(u8/i8)
+ *           \      /                 |
+ *            Gather             FakeQuantize
+ *                \               /
+ *                 \             /
+ *                     MatMul
+ */
+using DisableGatherCompressedForQuantizedModelParams = std::tuple<element::Type, InputShape, InputShape>;
+class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterface<DisableGatherCompressedForQuantizedModelParams>,
+                                                 virtual public SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<DisableGatherCompressedForQuantizedModelParams> obj) {
+        element::Type weight_prec;
+        InputShape inputShape1, inputShape2;
+        std::tie(weight_prec, inputShape1, inputShape2) = obj.param;
+        std::ostringstream result;
+        result << "weight_prec=" << weight_prec << "_" << "inputShape1=" << inputShape1 << "_"
+               << "inputShape2=" << inputShape2;
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        element::Type weight_prec;
+        InputShape inputShape1, inputShape2;
+        std::tie(weight_prec, inputShape1, inputShape2) = GetParam();
+
+        // auto input_shape1 = Shape{1, 3, 64, 64};
+        // auto input_shape2 = Shape{32};
+        init_input_shapes({inputShape1, inputShape2});
+
+        targetDevice = test::utils::DEVICE_CPU;
+        auto type = element::f32;
+
+        auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
+        auto input2 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[1]);
+
+        auto shared_il = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
+        auto shared_ih = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
+        auto shared_ol = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
+        auto shared_oh = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
+        auto fq = std::make_shared<op::v0::FakeQuantize>(input1, shared_il, shared_ih, shared_ol, shared_oh, 256);
+
+        // Weights
+        auto weights_shape = Shape{64, 64};
+        auto weights_vals = test::utils::generate_float_numbers(shape_size(weights_shape), -1, 1);
+        auto weights = op::v0::Constant::create(weight_prec, weights_shape, weights_vals);
+        auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
+        auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
+        // Indices
+        auto softmax = std::make_shared<op::v8::Softmax>(input2, 0);
+        auto multiply2 = std::make_shared<op::v1::Multiply>(softmax, op::v0::Constant::create(type, {1}, {64}));
+        auto indices = std::make_shared<op::v0::Convert>(multiply2, element::i64);
+        // Gather
+        auto gather =
+            std::make_shared<op::v8::Gather>(multiply, indices, op::v0::Constant::create(element::i32, Shape{1}, {0}));
+
+        auto matMul = std::make_shared<op::v0::MatMul>(fq, gather, false, true);
+
+        function = std::make_shared<Model>(matMul, ParameterVector{input1, input2});
+    }
+
+    void check_results() {
+        const auto& test_param = GetParam();
+        const auto compressed_weights_precision = std::get<0>(test_param);
+
+        const auto runtime_model = compiledModel.get_runtime_model();
+        const auto result = runtime_model->get_result();
+        const auto matmul = result->get_input_node_shared_ptr(0);
+        if (compressed_weights_precision == element::i8) {
+            EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
+        } else if (compressed_weights_precision == element::u8) {
+            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8)
+            // So reorder will be inserted when weights is not s8.
+            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
+            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            EXPECT_EQ(type, "Reorder");
+            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
+        } else {
+            // Keep GatherCompressed, so just check if Gather has 4 inputs.
+            for (const auto& n : runtime_model->get_ordered_ops()) {
+                const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+                if (type == "Gather") {
+                    EXPECT_GE(n->get_input_size(), 4);
+                }
+            }
+        }
+    }
+};
+
+TEST_P(DisableGatherCompressedForQuantizedModel, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    run();
+    check_results();
+}
+
+namespace {
+
+const std::vector<InputShape> inputShapes1 = {{{-1, 3, -1, -1}, {{1, 3, 64, 64}}}};
+const std::vector<InputShape> inputShapes2 = {{{}, {{32}}}};
+const std::vector<element::Type> weightsPrecisions = {element::i8, element::u8, element::u4, element::i4};
+
+INSTANTIATE_TEST_SUITE_P(smoke_DisableGatherCompressedForQuantizedModel_basic,
+                         DisableGatherCompressedForQuantizedModel,
+                         ::testing::Combine(::testing::ValuesIn(weightsPrecisions),
+                                            ::testing::ValuesIn(inputShapes1),
+                                            ::testing::ValuesIn(inputShapes2)),
+                         DisableGatherCompressedForQuantizedModel::getTestCaseName);
+
+} // namespace
+} // namespace test
+} // namespace ov

From 39b903d779926bb24d66555b253b1009f4230ebf Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 12 Jul 2024 10:02:42 +0000
Subject: [PATCH 4/9] Remove debug code

Signed-off-by: xipingya
---
 .../src/transformations/transformation_pipeline.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index b46ff64fbe160f..879ca16566c7b0 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -331,13 +331,12 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
-               if (std::getenv("WITH_PR")) {
-                   if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
-                                             ov::element::u8,
-                                             ov::element::i8) &&
-                       useLpt) {
-                       return true;
-                   }
+
+               if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                                         ov::element::u8,
+                                         ov::element::i8) &&
+                   useLpt) {
+                   return true;
                }
            }
            return false;

From 3b3d093099fc69bedfa923ab87bed68c6f7cd47e Mon Sep 17 00:00:00 2001
From: xipingya
Date: Thu, 18 Jul 2024 09:03:42 +0800
Subject: [PATCH 5/9] Add comments

---
 .../intel_cpu/src/transformations/transformation_pipeline.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 879ca16566c7b0..74ab29b430e075 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -332,6 +332,8 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
 
+               // Prioritize LPT pipeline to handle dequantization part for quantized models as it is more optimal in
+               // the general case
                if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
                                          ov::element::u8,
                                          ov::element::i8) &&

From d0b5d15ab5d1654d7f7e304f5559196cbf829b5b Mon Sep 17 00:00:00 2001
From: xipingya
Date: Thu, 18 Jul 2024 09:56:23 +0800
Subject: [PATCH 6/9] Just check MatMul input(1) precision.
Signed-off-by: xipingya
---
 .../src/disable_gathercompressed_quantized_model.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index 9d2ad390642206..0ca4fdd8baa1c2 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -88,12 +88,9 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
         if (compressed_weights_precision == element::i8) {
             EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
         } else if (compressed_weights_precision == element::u8) {
-            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8)
-            // So reorder will be inserted when weights is not s8.
-            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
-            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
-            EXPECT_EQ(type, "Reorder");
-            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
+            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8), So Reorder will be inserted when weights is
+            // not s8. The output precision of Reorder will be f32
+            EXPECT_EQ(matmul->get_input_element_type(1), element::f32);
         } else {
             // Keep GatherCompressed, so just check if Gather has 4 inputs.
             for (const auto& n : runtime_model->get_ordered_ops()) {

From 830c31dd04aa5faa28e2e98b9b88994000d8a6db Mon Sep 17 00:00:00 2001
From: xipingya
Date: Thu, 18 Jul 2024 14:28:56 +0800
Subject: [PATCH 7/9] Replace generate_float_numbers with make_constant.

Signed-off-by: xipingya
---
 .../src/disable_gathercompressed_quantized_model.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index 0ca4fdd8baa1c2..7eea8ce3d33878 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -1,8 +1,9 @@
-// Copyright (C) 2022 Intel Corporation
+// Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "common_test_utils/data_utils.hpp"
+#include "common_test_utils/node_builders/constant.hpp"
 #include "openvino/runtime/exec_model_info.hpp"
 #include "shared_test_classes/base/ov_subgraph.hpp"
 
@@ -38,16 +39,14 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
 
 protected:
     void SetUp() override {
-        targetDevice = ov::test::utils::DEVICE_CPU;
+        targetDevice = utils::DEVICE_CPU;
         element::Type weight_prec;
         InputShape inputShape1, inputShape2;
         std::tie(weight_prec, inputShape1, inputShape2) = GetParam();
 
-        // auto input_shape1 = Shape{1, 3, 64, 64};
-        // auto input_shape2 = Shape{32};
         init_input_shapes({inputShape1, inputShape2});
 
-        targetDevice = test::utils::DEVICE_CPU;
+        targetDevice = utils::DEVICE_CPU;
         auto type = element::f32;
 
         auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
@@ -61,8 +60,7 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
 
         // Weights
         auto weights_shape = Shape{64, 64};
-        auto weights_vals = test::utils::generate_float_numbers(shape_size(weights_shape), -1, 1);
-        auto weights = op::v0::Constant::create(weight_prec, weights_shape, weights_vals);
+        auto weights = utils::make_constant(weight_prec, weights_shape, utils::InputGenerateData(-1, 2, 32768));
         auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
         auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
         // Indices

From ef093a1fed00d4b6a7a26ce10623ea821b40f832 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 19 Jul 2024 11:14:20 +0800
Subject: [PATCH 8/9] Revert to initial version (just check reorder precision
 is U8 when embedding precision is I8)

Signed-off-by: xipingya
---
 .../src/disable_gathercompressed_quantized_model.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index 7eea8ce3d33878..d04d2c76f5335f 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -86,9 +86,13 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
         if (compressed_weights_precision == element::i8) {
             EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
         } else if (compressed_weights_precision == element::u8) {
-            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8), So Reorder will be inserted when weights is
-            // not s8. The output precision of Reorder will be f32
-            EXPECT_EQ(matmul->get_input_element_type(1), element::f32);
+            // Current oneDNN MatMul official support precision: Source(u8, s8), Weights(s8),
+            // It doesn't support: Source(u8), Weights(u8)
+            // So reorder will be inserted when weights is not s8.
+            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
+            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            EXPECT_EQ(type, "Reorder");
+            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
         } else {
             // Keep GatherCompressed, so just check if Gather has 4 inputs.
             for (const auto& n : runtime_model->get_ordered_ops()) {

From 6b3037483f11b49a527c54e75c88f38cf6ea1d3f Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 19 Jul 2024 14:37:15 +0800
Subject: [PATCH 9/9] 1: u4/i4: check that GatherCompressed exists; 2: u8/i8:
 check that Gather exists; 3: i8: check that MatMul's runtime precision is
 u8.

Signed-off-by: xipingya
---
 ...sable_gathercompressed_quantized_model.cpp | 51 ++++++++++++-------
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index d04d2c76f5335f..6df52b33e1a3fe 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -81,27 +81,42 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
         const auto compressed_weights_precision = std::get<0>(test_param);
 
         const auto runtime_model = compiledModel.get_runtime_model();
-        const auto result = runtime_model->get_result();
-        const auto matmul = result->get_input_node_shared_ptr(0);
-        if (compressed_weights_precision == element::i8) {
-            EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
-        } else if (compressed_weights_precision == element::u8) {
-            // Current oneDNN MatMul official support precision: Source(u8, s8), Weights(s8),
-            // It doesn't support: Source(u8), Weights(u8)
-            // So reorder will be inserted when weights is not s8.
-            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
-            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
-            EXPECT_EQ(type, "Reorder");
-            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
-        } else {
-            // Keep GatherCompressed, so just check if Gather has 4 inputs.
-            for (const auto& n : runtime_model->get_ordered_ops()) {
-                const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
-                if (type == "Gather") {
-                    EXPECT_GE(n->get_input_size(), 4);
-                }
-            }
-        }
+        const auto matmul = runtime_model->get_result()->get_input_node_shared_ptr(0);
+
+        bool have_gather = false;
+        bool have_gather_compressed = false;
+        for (const auto& n : runtime_model->get_ordered_ops()) {
+            const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            if (type == "Gather") {
+                // Gather with >= 4 inputs means it is GatherCompressed.
+                if (n->get_input_size() >= 4) {
+                    have_gather_compressed = true;
+                } else {
+                    have_gather = true;
+                }
+            }
+        }
+
+        switch (compressed_weights_precision) {
+        case element::i8:
+            EXPECT_TRUE(have_gather);
+            EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
+            // FakeQuantize (matmul's input(0)) has output precision u8
+            EXPECT_EQ(matmul->get_rt_info().at(ov::exec_model_info::RUNTIME_PRECISION).as<ov::element::Type>(),
+                      element::u8);
+            break;
+        case element::u8:
+            EXPECT_TRUE(have_gather);
+            // Current oneDNN MatMul official support precision: Source(u8, s8), Weights(s8).
+            // So reorder will be inserted when weights is not s8, don't need to check matmul's input(1) precision.
+            break;
+        case element::u4:
+        case element::i4:
+            EXPECT_TRUE(have_gather_compressed);
+            break;
+        default:
+            break;
+        }
     }
 };
 
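Beyond the tests, the execution-graph inspection that check_results() relies on follows a general pattern that works for any compiled model. A minimal, hypothetical sketch — the model path and device are placeholders, and the rt_info values are read back here as strings:

    #include <iostream>
    #include "openvino/runtime/core.hpp"
    #include "openvino/runtime/exec_model_info.hpp"

    int main() {
        ov::Core core;
        // Placeholder model; any compiled model exposes its execution graph.
        auto compiled = core.compile_model("model.xml", "CPU");
        for (const auto& node : compiled.get_runtime_model()->get_ordered_ops()) {
            const auto& rt = node->get_rt_info();
            // LAYER_TYPE shows the kernel a node was lowered to (e.g. "Gather", "Reorder").
            std::cout << rt.at(ov::exec_model_info::LAYER_TYPE).as<std::string>() << " : "
                      << rt.at(ov::exec_model_info::RUNTIME_PRECISION).as<std::string>() << "\n";
        }
        return 0;
    }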