
[CPU] Disable ConvertGatherToGatherCompressed optimization for quantized models #25478

Merged
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
      gather_input_scale);
  }

- transformation_callback(new_gather_node);
+ if (transformation_callback(new_gather_node)) {
+     return false;
+ }

  result_nodes.push_back(new_gather_node);
  new_gather_node->set_friendly_name(gather_node->get_friendly_name());
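With this change, the matcher consults its transformation_callback before materializing GatherCompressed: if the callback returns true, the matcher returns false and the original Gather (together with its decompression subgraph) is left untouched. Below is a minimal sketch of how a plugin can attach such a veto through the standard pass config; the predicate body is illustrative rather than the exact CPU-plugin logic, and the pass header path is assumed.

#include <memory>

#include "openvino/pass/manager.hpp"
#include "transformations/op_conversions/convert_gather_to_compressed.hpp"  // header path assumed

void register_gather_compressed_with_veto(ov::pass::Manager& manager) {
    manager.register_pass<ov::pass::ConvertGatherToGatherCompressed>();
    // Returning true here makes transformation_callback(new_gather_node) evaluate to true
    // inside the matcher body above, so the rewrite bails out and the plain Gather is kept.
    manager.get_pass_config()->set_callback<ov::pass::ConvertGatherToGatherCompressed>(
        [](const std::shared_ptr<const ov::Node>& node) -> bool {
            // Illustrative predicate: veto the conversion for u8/i8 weight constants.
            const auto weights_type = node->get_input_element_type(0);
            return weights_type == ov::element::u8 || weights_type == ov::element::i8;
        });
}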
@@ -310,6 +310,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
  ov::pass::Manager decompression_handling_manager;
  decompression_handling_manager.set_per_pass_validation(false);
  CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
+ const bool useLpt = !defaultPrecisions.empty();
  CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
  CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
  // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
@@ -330,6 +331,15 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
  if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
      // It is necessary to avoid precision conversion for constant node(compressed weights)
      ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+
+     // Prioritize the LPT pipeline to handle the dequantization part for quantized models,
+     // as it is more optimal in the general case
+     if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                               ov::element::u8,
+                               ov::element::i8) &&
+         useLpt) {
+         return true;
+     }
  }
  return false;
  },
@@ -338,7 +348,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis

  ov::pass::Manager manager;
  manager.set_per_pass_validation(false);
- const bool useLpt = !defaultPrecisions.empty();
  if (useLpt)
      CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions);

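For reference, the subgraph the pass (and now the callback) inspects is a weight Constant feeding Convert and Multiply, i.e. the decompression/dequantization part, in front of a Gather. Below is a minimal sketch of that pattern built with the same operator API the new test uses; shapes, values, and the helper name are illustrative. With u8/i8 weights and LPT enabled, the callback above keeps this as a plain Gather so LPT can fold the Convert/Multiply; with u4/i4 weights it is still converted to GatherCompressed.

#include "openvino/openvino.hpp"
#include "openvino/op/ops.hpp"

// Illustrative u8 weight-decompression subgraph feeding a Gather.
std::shared_ptr<ov::Model> make_decompressed_gather() {
    auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{8});
    auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{64, 64}, {1});
    auto convert = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
    auto scale   = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1, 1}, {0.125f});
    auto deq     = std::make_shared<ov::op::v1::Multiply>(convert, scale);   // dequantized weights
    auto axis    = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0});
    auto gather  = std::make_shared<ov::op::v8::Gather>(deq, indices, axis);
    return std::make_shared<ov::Model>(ov::OutputVector{gather}, ov::ParameterVector{indices});
}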
@@ -0,0 +1,144 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "common_test_utils/data_utils.hpp"
#include "common_test_utils/node_builders/constant.hpp"
#include "openvino/runtime/exec_model_info.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"

namespace ov {
namespace test {
/*
 *                      input2
 *                        |
 *   Constant(i8)      Softmax
 *        |               |
 *     Convert         Multiply
 *        |               |
 *     Multiply        Convert      input1(u8/i8)
 *          \            /               |
 *           \          /                |
 *            Gather                FakeQuantize
 *                \                    /
 *                 \                  /
 *                      MatMul
 */
using DisableGatherCompressedForQuantizedModelParams = std::tuple<element::Type, InputShape, InputShape>;
class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterface<DisableGatherCompressedForQuantizedModelParams>,
virtual public SubgraphBaseTest {
public:
static std::string getTestCaseName(testing::TestParamInfo<DisableGatherCompressedForQuantizedModelParams> obj) {
element::Type weight_prec;
InputShape inputShape1, inputShape2;
std::tie(weight_prec, inputShape1, inputShape2) = obj.param;
std::ostringstream result;
result << "weight_prec=" << weight_prec << "_" << "inputShape1=" << inputShape1 << "_"
<< "inputShape2=" << inputShape2;
return result.str();
}

protected:
void SetUp() override {
targetDevice = utils::DEVICE_CPU;
element::Type weight_prec;
InputShape inputShape1, inputShape2;
std::tie(weight_prec, inputShape1, inputShape2) = GetParam();

init_input_shapes({inputShape1, inputShape2});

targetDevice = utils::DEVICE_CPU;
auto type = element::f32;

auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
auto input2 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[1]);

auto shared_il = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
auto shared_ih = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
auto shared_ol = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
auto shared_oh = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
auto fq = std::make_shared<op::v0::FakeQuantize>(input1, shared_il, shared_ih, shared_ol, shared_oh, 256);

// Weights
auto weights_shape = Shape{64, 64};
auto weights = utils::make_constant(weight_prec, weights_shape, utils::InputGenerateData(-1, 2, 32768));
auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
// Indices
auto softmax = std::make_shared<op::v1::Softmax>(input2, 0);
auto multiply2 = std::make_shared<op::v1::Multiply>(softmax, op::v0::Constant::create(type, {1}, {64}));
auto indices = std::make_shared<op::v0::Convert>(multiply2, element::i64);
// Gather
auto gather =
    std::make_shared<op::v8::Gather>(multiply, indices, op::v0::Constant::create(element::i32, Shape{1}, {0}));

auto matMul = std::make_shared<ov::op::v0::MatMul>(fq, gather, false, true);

function = std::make_shared<Model>(matMul, ParameterVector{input1, input2});
}

void check_results() {
const auto& test_param = GetParam();
const auto compressed_weights_precision = std::get<0>(test_param);

const auto runtime_model = compiledModel.get_runtime_model();
const auto matmul = runtime_model->get_result()->get_input_node_shared_ptr(0);

bool have_gather = false;
bool have_gather_compressed = false;
for (const auto& n : runtime_model->get_ordered_ops()) {
const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
if (type == "Gather") {
// A Gather with >= 4 inputs is a GatherCompressed.
if (n->get_input_size() >= 4) {
have_gather_compressed = true;
} else {
have_gather = true;
}
}
}

switch (compressed_weights_precision) {
case element::i8:
EXPECT_TRUE(have_gather);
EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
// The FakeQuantize feeding matmul's input(0) has u8 output precision
EXPECT_EQ(matmul->get_rt_info().at(ov::exec_model_info::RUNTIME_PRECISION).as<ov::element::Type>(),
element::u8);
break;
case element::u8:
EXPECT_TRUE(have_gather);
// oneDNN MatMul officially supports Source(u8, s8) and Weights(s8) precisions.
// So a Reorder is inserted when the weights are not s8; there is no need to check matmul's input(1) precision.
break;
case element::u4:
case element::i4:
EXPECT_TRUE(have_gather_compressed);
break;
default:
break;
}
}
};

TEST_P(DisableGatherCompressedForQuantizedModel, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
run();
check_results();
}

namespace {

const std::vector<InputShape> inputShapes1 = {{{-1, 3, -1, -1}, {{1, 3, 64, 64}}}};
const std::vector<InputShape> inputShapes2 = {{{}, {{32}}}};
const std::vector<element::Type> weightsPrecisions = {element::i8, element::u8, element::u4, element::i4};

INSTANTIATE_TEST_SUITE_P(smoke_DisableGatherCompressedForQuantizedModel_basic,
DisableGatherCompressedForQuantizedModel,
::testing::Combine(::testing::ValuesIn(weightsPrecisions),
::testing::ValuesIn(inputShapes1),
::testing::ValuesIn(inputShapes2)),
DisableGatherCompressedForQuantizedModel::getTestCaseName);

} // namespace
} // namespace test
} // namespace ov