[CPU] Disable ConvertGatherToGatherCompressed optimization for quantized models #25478

Merged
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
gather_input_scale);
}

-transformation_callback(new_gather_node);
+if (transformation_callback(new_gather_node)) {
+    return false;
+}

result_nodes.push_back(new_gather_node);
new_gather_node->set_friendly_name(gather_node->get_friendly_name());
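With this change the callback's verdict is finally honoured: previously transformation_callback(new_gather_node) was invoked but its return value was discarded, so a plugin could never veto the rewrite. Returning true now aborts the match and leaves the original Gather (with its weight-decompression subgraph) intact. Below is a minimal sketch of how a plugin could attach such a callback through the pass config; the header path and the predicate are assumptions, and the CPU plugin actually registers its lambda via its own callback macro (see the next file):

#include "openvino/pass/manager.hpp"
// Header path assumed for the pass declaration:
#include "transformations/op_conversions/convert_gather_to_compressed.hpp"

void register_with_veto(ov::pass::Manager& manager) {
    manager.register_pass<ov::pass::ConvertGatherToGatherCompressed>();
    // The callback receives the candidate GatherCompressed node; input 0 is the
    // compressed weight constant. Returning true skips the conversion.
    manager.get_pass_config()->set_callback<ov::pass::ConvertGatherToGatherCompressed>(
        [](const std::shared_ptr<const ov::Node>& node) -> bool {
            const auto weights = node->get_input_node_shared_ptr(0);
            return weights->get_element_type() == ov::element::i8;  // hypothetical veto condition
        });
}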
@@ -310,6 +310,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions
ov::pass::Manager decompression_handling_manager;
decompression_handling_manager.set_per_pass_validation(false);
CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
+const bool useLpt = !defaultPrecisions.empty();
CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
// We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
@@ -330,6 +331,13 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions
if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
// It is necessary to avoid precision conversion for constant node(compressed weights)
ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+
+if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                          ov::element::u8,
+                          ov::element::i8) &&
+    useLpt) {
+    return true;
+}
}
return false;
},
@@ -338,7 +346,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions

ov::pass::Manager manager;
manager.set_per_pass_validation(false);
-const bool useLpt = !defaultPrecisions.empty();
if (useLpt)
CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions);

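Net effect of the two hunks above: useLpt is now computed before the decompression-handling manager runs, and for quantized models (LPT enabled) whose Gather weights are already u8 or i8 the callback returns true, so the pass keeps the plain Gather and its dequantization subgraph for the low-precision pipeline to fold; oneDNN can then consume the int8 weights directly, which is exactly what the new test below asserts. The same veto logic as a standalone sketch (include paths assumed; one_of is the CPU plugin's variadic membership helper):

#include "openvino/core/node.hpp"
#include "ov_ops/gather_compressed.hpp"                      // path assumed
#include "transformations/rt_info/keep_const_precision.hpp"  // path assumed

bool skip_gather_compressed_for_quantized(const std::shared_ptr<const ov::Node>& node, bool useLpt) {
    if (!ov::is_type<ov::op::internal::GatherCompressed>(node))
        return false;
    const auto weights = node->get_input_node_shared_ptr(0);
    // Compressed weights keep their storage precision in either case.
    ov::enable_keep_const_precision(weights);
    const auto prec = weights->get_element_type();
    return useLpt && (prec == ov::element::u8 || prec == ov::element::i8);
}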
@@ -0,0 +1,130 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "common_test_utils/data_utils.hpp"
#include "openvino/runtime/exec_model_info.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"

namespace ov {
namespace test {
/*
* input2
* |
* Constant(i8) Softmax
* | /
* Convert Multiply
* | /
* Multiply Convert input1(u8/i8)
* \ / |
* Gather FakeQuantize
* \ /
* \ /
* MatMul
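*
* The FakeQuantize on input1 makes this a quantized model, so LPT is enabled:
* for i8/u8 weights the plain Gather path must be kept (no GatherCompressed),
* while u4/i4 weights are still expected to fold into GatherCompressed.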
*/
using DisableGatherCompressedForQuantizedModelParams = std::tuple<element::Type, InputShape, InputShape>;
class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterface<DisableGatherCompressedForQuantizedModelParams>,
virtual public SubgraphBaseTest {
public:
static std::string getTestCaseName(testing::TestParamInfo<DisableGatherCompressedForQuantizedModelParams> obj) {
element::Type weight_prec;
InputShape inputShape1, inputShape2;
std::tie(weight_prec, inputShape1, inputShape2) = obj.param;
std::ostringstream result;
result << "weight_prec=" << weight_prec << "_" << "inputShape1=" << inputShape1 << "_"
<< "inputShape2=" << inputShape2;
return result.str();
}

protected:
void SetUp() override {
targetDevice = ov::test::utils::DEVICE_CPU;
element::Type weight_prec;
InputShape inputShape1, inputShape2;
std::tie(weight_prec, inputShape1, inputShape2) = GetParam();

init_input_shapes({inputShape1, inputShape2});

auto type = element::f32;

auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
auto input2 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[1]);

auto shared_il = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
auto shared_ih = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
auto shared_ol = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
auto shared_oh = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
auto fq = std::make_shared<op::v0::FakeQuantize>(input1, shared_il, shared_ih, shared_ol, shared_oh, 256);

// Weights
auto weights_shape = Shape{64, 64};
auto weights_vals = test::utils::generate_float_numbers(shape_size(weights_shape), -1, 1);
auto weights = op::v0::Constant::create(weight_prec, weights_shape, weights_vals);
auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
// Indices
auto softmax = std::make_shared<op::v1::Softmax>(input2, 0);
auto multiply2 = std::make_shared<op::v1::Multiply>(softmax, op::v0::Constant::create(type, {1}, {64}));
auto indices = std::make_shared<op::v0::Convert>(multiply2, element::i64);
// Gather
auto gather =
std::make_shared<op::v8::Gather>(multiply, indices, op::v0::Constant::create(element::i32, Shape{1}, {0}));

auto matMul = std::make_shared<ov::op::v0::MatMul>(fq, gather, false, true);

function = std::make_shared<Model>(matMul, ParameterVector{input1, input2});
}

void check_results() {
const auto& test_param = GetParam();
const auto compressed_weights_precision = std::get<0>(test_param);

const auto runtime_model = compiledModel.get_runtime_model();
const auto result = runtime_model->get_result();
const auto matmul = result->get_input_node_shared_ptr(0);
if (compressed_weights_precision == element::i8) {
EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
} else if (compressed_weights_precision == element::u8) {
// oneDNN MatMul supports Source(u8, s8) with Weights(s8) only,
// so a Reorder is inserted when the weights are not s8.
const auto mm_producer = matmul->get_input_node_shared_ptr(1);
const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
EXPECT_EQ(type, "Reorder");
EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
} else {
// GatherCompressed is kept, so check that the Gather node has at least 4 inputs.
for (const auto& n : runtime_model->get_ordered_ops()) {
const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
if (type == "Gather") {
EXPECT_GE(n->get_input_size(), 4);
}
}
}
}
};

TEST_P(DisableGatherCompressedForQuantizedModel, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
run();
check_results();
}

namespace {

const std::vector<InputShape> inputShapes1 = {{{-1, 3, -1, -1}, {{1, 3, 64, 64}}}};
const std::vector<InputShape> inputShapes2 = {{{}, {{32}}}};
const std::vector<element::Type> weightsPrecisions = {element::i8, element::u8, element::u4, element::i4};

INSTANTIATE_TEST_SUITE_P(smoke_DisableGatherCompressedForQuantizedModel_basic,
DisableGatherCompressedForQuantizedModel,
::testing::Combine(::testing::ValuesIn(weightsPrecisions),
::testing::ValuesIn(inputShapes1),
::testing::ValuesIn(inputShapes2)),
DisableGatherCompressedForQuantizedModel::getTestCaseName);

} // namespace
} // namespace test
} // namespace ov
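To run only this suite locally, the usual gtest filter applies, assuming the standard CPU functional-test binary: ov_cpu_func_tests --gtest_filter='*DisableGatherCompressedForQuantizedModel*'.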