From 96ecfc096330d0472ceb283423569371b884a7b9 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Wed, 10 Jul 2024 10:41:01 +0800
Subject: [PATCH 1/9] Filter out small weights for GatherCompressed.

Reasons:
1: Small weights have little impact on compile_model performance;
2: There is a high probability that constant folding will not be performed
   during compile_model.

Signed-off-by: xipingya
---
 .../op_conversions/convert_gather_to_compressed.cpp | 4 +++-
 .../src/transformations/transformation_pipeline.cpp | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 3d1ad8ff2b3b6a..156481fb893227 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
                                                              gather_input_scale);
     }
 
-    transformation_callback(new_gather_node);
+    if (transformation_callback(new_gather_node)) {
+        return false;
+    }
 
     result_nodes.push_back(new_gather_node);
     new_gather_node->set_friendly_name(gather_node->get_friendly_name());
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index bd3397245f5a26..55a52cd880ecf1 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -328,6 +328,15 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+
+               // Heuristic:
+               // 1: Small weights have little impact on compile_model performance;
+               // 2: There is a high probability that constant folding will not be performed during compile_model;
+               const auto& input_partial_shape = node->get_input_partial_shape(0);
+               const auto& rank = input_partial_shape.rank();
+               if (rank.is_static() && (rank.get_length() == 2)) {
+                   return ov::shape_size(input_partial_shape.get_shape()) < 256u * 512u;
+               }
            }
            return false;
        },
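Read on its own, the heuristic above amounts to the following standalone predicate — an illustrative sketch only, with a hypothetical helper name that is not part of the patch; it mirrors the 2-D, 256u * 512u element threshold used in the callback:

    #include <memory>
    #include "openvino/core/node.hpp"
    #include "openvino/core/shape.hpp"

    // A weight constant is "small" when it is a static 2-D tensor with fewer
    // than 256 * 512 elements; such weights barely affect compile_model time
    // and are unlikely to be constant-folded anyway.
    static bool is_small_2d_weight(const std::shared_ptr<ov::Node>& weight) {
        const auto& pshape = weight->get_output_partial_shape(0);
        const auto& rank = pshape.rank();
        if (rank.is_static() && rank.get_length() == 2 && pshape.is_static())
            return ov::shape_size(pshape.get_shape()) < 256u * 512u;
        return false;
    }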
From 1420794cec8bb2a2d3c33e2618cfe4e31dcee2e1 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Wed, 10 Jul 2024 14:29:34 +0800
Subject: [PATCH 2/9] ov::pass::ConvertGatherToGatherCompressed breaks the
 original structure of the model; enable it only when useLpt == false.

Signed-off-by: xipingya
---
 .../convert_gather_to_compressed.cpp            |  4 +---
 .../transformations/transformation_pipeline.cpp | 15 ++++-----------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 156481fb893227..3d1ad8ff2b3b6a 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -134,9 +134,7 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
                                                              gather_input_scale);
     }
 
-    if (transformation_callback(new_gather_node)) {
-        return false;
-    }
+    transformation_callback(new_gather_node);
 
     result_nodes.push_back(new_gather_node);
     new_gather_node->set_friendly_name(gather_node->get_friendly_name());
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 55a52cd880ecf1..4a4d0b270f18db 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -308,7 +308,10 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     ov::pass::Manager decompression_handling_manager;
     decompression_handling_manager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
-    CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
+    const bool useLpt = !defaultPrecisions.empty();
+    if (!useLpt) {
+        CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
+    }
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
@@ -328,15 +331,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
-
-               // Heuristic:
-               // 1: Small weights have little impact on compile_model performance;
-               // 2: There is a high probability that constant folding will not be performed during compile_model;
-               const auto& input_partial_shape = node->get_input_partial_shape(0);
-               const auto& rank = input_partial_shape.rank();
-               if (rank.is_static() && (rank.get_length() == 2)) {
-                   return ov::shape_size(input_partial_shape.get_shape()) < 256u * 512u;
-               }
            }
            return false;
        },
@@ -345,7 +339,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
 
     ov::pass::Manager manager;
     manager.set_per_pass_validation(false);
-    const bool useLpt = !defaultPrecisions.empty();
     if (useLpt)
         CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions);
 

From 8df41d4ec81147a57c01592f21fbb2f5b15b2a04 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 12 Jul 2024 09:59:10 +0000
Subject: [PATCH 3/9] Update: 1: Add test; 2: When u8/i8 + useLpt, disable
 "ConvertGatherToGatherCompressed".
Signed-off-by: xipingya
---
 .../convert_gather_to_compressed.cpp          |   4 +-
 .../transformation_pipeline.cpp               |  12 +-
 ...sable_gathercompressed_quantized_model.cpp | 130 ++++++++++++++++++
 3 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 3d1ad8ff2b3b6a..156481fb893227 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
                                                              gather_input_scale);
     }
 
-    transformation_callback(new_gather_node);
+    if (transformation_callback(new_gather_node)) {
+        return false;
+    }
 
     result_nodes.push_back(new_gather_node);
     new_gather_node->set_friendly_name(gather_node->get_friendly_name());
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index f17fe8adadb347..b46ff64fbe160f 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -311,9 +311,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     decompression_handling_manager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
     const bool useLpt = !defaultPrecisions.empty();
-    if (!useLpt) {
-        CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
-    }
+    CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
@@ -333,6 +331,14 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+               if (std::getenv("WITH_PR")) {
+                   if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                                             ov::element::u8,
+                                             ov::element::i8) &&
+                       useLpt) {
+                       return true;
+                   }
+               }
            }
            return false;
        },
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
new file mode 100644
index 00000000000000..9d2ad390642206
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -0,0 +1,130 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/data_utils.hpp"
+#include "openvino/runtime/exec_model_info.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace ov {
+namespace test {
+/*
+ *                        input2
+ *                           |
+ *    Constant(i8)        Softmax
+ *         |              /
+ *      Convert      Multiply
+ *         |         /
+ *      Multiply   Convert      input1(u8/i8)
+ *           \      /                 |
+ *            Gather             FakeQuantize
+ *                \               /
+ *                 \             /
+ *                     MatMul
+ */
+using DisableGatherCompressedForQuantizedModelParams = std::tuple<element::Type, InputShape, InputShape>;
+class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterface<DisableGatherCompressedForQuantizedModelParams>,
+                                                 virtual public SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<DisableGatherCompressedForQuantizedModelParams> obj) {
+        element::Type weight_prec;
+        InputShape inputShape1, inputShape2;
+        std::tie(weight_prec, inputShape1, inputShape2) = obj.param;
+        std::ostringstream result;
+        result << "weight_prec=" << weight_prec << "_" << "inputShape1=" << inputShape1 << "_"
+               << "inputShape2=" << inputShape2;
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        element::Type weight_prec;
+        InputShape inputShape1, inputShape2;
+        std::tie(weight_prec, inputShape1, inputShape2) = GetParam();
+
+        // auto input_shape1 = Shape{1, 3, 64, 64};
+        // auto input_shape2 = Shape{32};
+        init_input_shapes({inputShape1, inputShape2});
+
+        targetDevice = test::utils::DEVICE_CPU;
+        auto type = element::f32;
+
+        auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
+        auto input2 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[1]);
+
+        auto shared_il = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
+        auto shared_ih = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
+        auto shared_ol = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
+        auto shared_oh = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
+        auto fq = std::make_shared<op::v0::FakeQuantize>(input1, shared_il, shared_ih, shared_ol, shared_oh, 256);
+
+        // Weights
+        auto weights_shape = Shape{64, 64};
+        auto weights_vals = test::utils::generate_float_numbers(shape_size(weights_shape), -1, 1);
+        auto weights = op::v0::Constant::create(weight_prec, weights_shape, weights_vals);
+        auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
+        auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
+        // Indices
+        auto softmax = std::make_shared<op::v8::Softmax>(input2, 0);
+        auto multiply2 = std::make_shared<op::v1::Multiply>(softmax, op::v0::Constant::create(type, {1}, {64}));
+        auto indices = std::make_shared<op::v0::Convert>(multiply2, element::i64);
+        // Gather
+        auto gather =
+            std::make_shared<op::v8::Gather>(multiply, indices, op::v0::Constant::create(element::i32, Shape{1}, {0}));
+
+        auto matMul = std::make_shared<op::v0::MatMul>(fq, gather, false, true);
+
+        function = std::make_shared<Model>(matMul, ParameterVector{input1, input2});
+    }
+
+    void check_results() {
+        const auto& test_param = GetParam();
+        const auto compressed_weights_precision = std::get<0>(test_param);
+
+        const auto runtime_model = compiledModel.get_runtime_model();
+        const auto result = runtime_model->get_result();
+        const auto matmul = result->get_input_node_shared_ptr(0);
+        if (compressed_weights_precision == element::i8) {
+            EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
+        } else if (compressed_weights_precision == element::u8) {
+            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8)
+            // So reorder will be inserted when weights is not s8.
+            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
+            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            EXPECT_EQ(type, "Reorder");
+            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
+        } else {
+            // Keep GatherCompressed, so just check if Gather has 4 inputs.
+            for (const auto& n : runtime_model->get_ordered_ops()) {
+                const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+                if (type == "Gather") {
+                    EXPECT_GE(n->get_input_size(), 4);
+                }
+            }
+        }
+    }
+};
+
+TEST_P(DisableGatherCompressedForQuantizedModel, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    run();
+    check_results();
+}
+
+namespace {
+
+const std::vector<InputShape> inputShapes1 = {{{-1, 3, -1, -1}, {{1, 3, 64, 64}}}};
+const std::vector<InputShape> inputShapes2 = {{{}, {{32}}}};
+const std::vector<element::Type> weightsPrecisions = {element::i8, element::u8, element::u4, element::i4};
+
+INSTANTIATE_TEST_SUITE_P(smoke_DisableGatherCompressedForQuantizedModel_basic,
+                         DisableGatherCompressedForQuantizedModel,
+                         ::testing::Combine(::testing::ValuesIn(weightsPrecisions),
+                                            ::testing::ValuesIn(inputShapes1),
+                                            ::testing::ValuesIn(inputShapes2)),
+                         DisableGatherCompressedForQuantizedModel::getTestCaseName);
+
+} // namespace
+} // namespace test
+} // namespace ov

From 39b903d779926bb24d66555b253b1009f4230ebf Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 12 Jul 2024 10:02:42 +0000
Subject: [PATCH 4/9] Remove debug code

Signed-off-by: xipingya
---
 .../src/transformations/transformation_pipeline.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index b46ff64fbe160f..879ca16566c7b0 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -331,13 +331,12 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
            if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
-               if (std::getenv("WITH_PR")) {
-                   if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
-                                             ov::element::u8,
-                                             ov::element::i8) &&
-                       useLpt) {
-                       return true;
-                   }
+
+               if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                                         ov::element::u8,
+                                         ov::element::i8) &&
+                   useLpt) {
+                   return true;
                }
            }
            return false;

From 3b3d093099fc69bedfa923ab87bed68c6f7cd47e Mon Sep 17 00:00:00 2001
From: xipingya
Date: Thu, 18 Jul 2024 09:03:42 +0800
Subject: [PATCH 5/9] Add comments

---
 .../intel_cpu/src/transformations/transformation_pipeline.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 879ca16566c7b0..74ab29b430e075 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -332,6 +332,8 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
                // It is necessary to avoid precision conversion for constant node(compressed weights)
                ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
 
+               // Prioritize LPT pipeline to handle dequantization part for quantized models as it is more optimal in
+               // the general case
                if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
                                          ov::element::u8,
                                          ov::element::i8) &&

From d0b5d15ab5d1654d7f7e304f5559196cbf829b5b Mon Sep 17 00:00:00 2001
From: xipingya
Date: Thu, 18 Jul 2024 09:56:23 +0800
Subject: [PATCH 6/9] Just check MatMul input(1) precision.
Signed-off-by: xipingya
---
 .../src/disable_gathercompressed_quantized_model.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index 9d2ad390642206..0ca4fdd8baa1c2 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -88,12 +88,9 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
         if (compressed_weights_precision == element::i8) {
             EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
         } else if (compressed_weights_precision == element::u8) {
-            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8)
-            // So reorder will be inserted when weights is not s8.
-            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
-            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
-            EXPECT_EQ(type, "Reorder");
-            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
+            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8), So Reorder will be inserted when weights is
+            // not s8. The output precision of Reorder will be f32
+            EXPECT_EQ(matmul->get_input_element_type(1), element::f32);
         } else {
             // Keep GatherCompressed, so just check if Gather has 4 inputs.
             for (const auto& n : runtime_model->get_ordered_ops()) {

From 830c31dd04aa5faa28e2e98b9b88994000d8a6db Mon Sep 17 00:00:00 2001
From: xipingya
Date: Thu, 18 Jul 2024 14:28:56 +0800
Subject: [PATCH 7/9] Replace generate_float_numbers with make_constant.

Signed-off-by: xipingya
---
 .../src/disable_gathercompressed_quantized_model.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index 0ca4fdd8baa1c2..7eea8ce3d33878 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -1,8 +1,9 @@
-// Copyright (C) 2022 Intel Corporation
+// Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "common_test_utils/data_utils.hpp"
+#include "common_test_utils/node_builders/constant.hpp"
 #include "openvino/runtime/exec_model_info.hpp"
 #include "shared_test_classes/base/ov_subgraph.hpp"
 
@@ -38,16 +39,14 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
 
 protected:
     void SetUp() override {
-        targetDevice = ov::test::utils::DEVICE_CPU;
+        targetDevice = utils::DEVICE_CPU;
         element::Type weight_prec;
         InputShape inputShape1, inputShape2;
         std::tie(weight_prec, inputShape1, inputShape2) = GetParam();
 
-        // auto input_shape1 = Shape{1, 3, 64, 64};
-        // auto input_shape2 = Shape{32};
         init_input_shapes({inputShape1, inputShape2});
 
-        targetDevice = test::utils::DEVICE_CPU;
+        targetDevice = utils::DEVICE_CPU;
         auto type = element::f32;
 
         auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
@@ -61,8 +60,7 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
 
         // Weights
         auto weights_shape = Shape{64, 64};
-        auto weights_vals = test::utils::generate_float_numbers(shape_size(weights_shape), -1, 1);
-        auto weights = op::v0::Constant::create(weight_prec, weights_shape, weights_vals);
+        auto weights = utils::make_constant(weight_prec, weights_shape, utils::InputGenerateData(-1, 2, 32768));
         auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
         auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
         // Indices

From ef093a1fed00d4b6a7a26ce10623ea821b40f832 Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 19 Jul 2024 11:14:20 +0800
Subject: [PATCH 8/9] Revert to initial version (just check reorder precision
 is U8 when embedding precision is I8)

Signed-off-by: xipingya
---
 .../src/disable_gathercompressed_quantized_model.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index 7eea8ce3d33878..d04d2c76f5335f 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -86,9 +86,13 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
         if (compressed_weights_precision == element::i8) {
             EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
         } else if (compressed_weights_precision == element::u8) {
-            // oneDNN MatMul support precision: Source(u8, s8), Weights(s8), So Reorder will be inserted when weights is
-            // not s8. The output precision of Reorder will be f32
-            EXPECT_EQ(matmul->get_input_element_type(1), element::f32);
+            // Current oneDNN MatMul official support precision: Source(u8, s8), Weights(s8),
+            // It doesn't support: Source(u8), Weights(u8)
+            // So reorder will be inserted when weights is not s8.
+            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
+            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            EXPECT_EQ(type, "Reorder");
+            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
         } else {
             // Keep GatherCompressed, so just check if Gather has 4 inputs.
             for (const auto& n : runtime_model->get_ordered_ops()) {

From 6b3037483f11b49a527c54e75c88f38cf6ea1d3f Mon Sep 17 00:00:00 2001
From: xipingya
Date: Fri, 19 Jul 2024 14:37:15 +0800
Subject: [PATCH 9/9] 1: u4/i4: check that GatherCompressed exists; 2: u8/i8:
 check that Gather exists; 3: i8: check that MatMul's runtime precision is
 u8.

Signed-off-by: xipingya
---
 ...sable_gathercompressed_quantized_model.cpp | 51 ++++++++++++-------
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
index d04d2c76f5335f..6df52b33e1a3fe 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/disable_gathercompressed_quantized_model.cpp
@@ -81,27 +81,42 @@ class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterf
         const auto compressed_weights_precision = std::get<0>(test_param);
 
         const auto runtime_model = compiledModel.get_runtime_model();
-        const auto result = runtime_model->get_result();
-        const auto matmul = result->get_input_node_shared_ptr(0);
-        if (compressed_weights_precision == element::i8) {
-            EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
-        } else if (compressed_weights_precision == element::u8) {
-            // Current oneDNN MatMul official support precision: Source(u8, s8), Weights(s8),
-            // It doesn't support: Source(u8), Weights(u8)
-            // So reorder will be inserted when weights is not s8.
-            const auto mm_producer = matmul->get_input_node_shared_ptr(1);
-            const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
-            EXPECT_EQ(type, "Reorder");
-            EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
-        } else {
-            // Keep GatherCompressed, so just check if Gather has 4 inputs.
-            for (const auto& n : runtime_model->get_ordered_ops()) {
-                const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
-                if (type == "Gather") {
-                    EXPECT_GE(n->get_input_size(), 4);
-                }
-            }
-        }
+        const auto matmul = runtime_model->get_result()->get_input_node_shared_ptr(0);
+
+        bool have_gather = false;
+        bool have_gather_compressed = false;
+        for (const auto& n : runtime_model->get_ordered_ops()) {
+            const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            if (type == "Gather") {
+                // Gather with >= 4 inputs means it is GatherCompressed.
+                if (n->get_input_size() >= 4) {
+                    have_gather_compressed = true;
+                } else {
+                    have_gather = true;
+                }
+            }
+        }
+
+        switch (compressed_weights_precision) {
+        case element::i8:
+            EXPECT_TRUE(have_gather);
+            EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
+            // FakeQuantize (matmul's input(0)) has output precision u8
+            EXPECT_EQ(matmul->get_rt_info().at(ov::exec_model_info::RUNTIME_PRECISION).as<ov::element::Type>(),
+                      element::u8);
+            break;
+        case element::u8:
+            EXPECT_TRUE(have_gather);
+            // Current oneDNN MatMul official support precision: Source(u8, s8), Weights(s8).
+            // So reorder will be inserted when weights is not s8, don't need to check matmul's input(1) precision.
+            break;
+        case element::u4:
+        case element::i4:
+            EXPECT_TRUE(have_gather_compressed);
+            break;
+        default:
+            break;
+        }
     }
 };
 
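Beyond the tests, the execution-graph inspection that check_results() relies on follows a general pattern that works for any compiled model. A minimal, hypothetical sketch — the model path and device are placeholders, and the rt_info values are read back here as strings:

    #include <iostream>
    #include "openvino/runtime/core.hpp"
    #include "openvino/runtime/exec_model_info.hpp"

    int main() {
        ov::Core core;
        // Placeholder model; any compiled model exposes its execution graph.
        auto compiled = core.compile_model("model.xml", "CPU");
        for (const auto& node : compiled.get_runtime_model()->get_ordered_ops()) {
            const auto& rt = node->get_rt_info();
            // LAYER_TYPE shows the kernel a node was lowered to (e.g. "Gather", "Reorder").
            std::cout << rt.at(ov::exec_model_info::LAYER_TYPE).as<std::string>() << " : "
                      << rt.at(ov::exec_model_info::RUNTIME_PRECISION).as<std::string>() << "\n";
        }
        return 0;
    }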