[CPU] Disable ConvertGatherToGatherCompressed optimization for quantized models #25478

Merged
@@ -134,7 +134,9 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
gather_input_scale);
}

-transformation_callback(new_gather_node);
+if (transformation_callback(new_gather_node)) {
+    return false;
+}

result_nodes.push_back(new_gather_node);
new_gather_node->set_friendly_name(gather_node->get_friendly_name());
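With this change the callback's verdict is finally honoured: previously transformation_callback(new_gather_node) was invoked but its return value was discarded, so a plugin could never veto the rewrite. Returning true now aborts the match and leaves the original Gather (with its weight-decompression subgraph) intact. Below is a minimal sketch of how a plugin could attach such a callback through the pass config; the header path and the predicate are assumptions, and the CPU plugin actually registers its lambda via its own callback macro (see the next file):

#include "openvino/pass/manager.hpp"
// Header path assumed for the pass declaration:
#include "transformations/op_conversions/convert_gather_to_compressed.hpp"

void register_with_veto(ov::pass::Manager& manager) {
    manager.register_pass<ov::pass::ConvertGatherToGatherCompressed>();
    // The callback receives the candidate GatherCompressed node; input 0 is the
    // compressed weight constant. Returning true skips the conversion.
    manager.get_pass_config()->set_callback<ov::pass::ConvertGatherToGatherCompressed>(
        [](const std::shared_ptr<const ov::Node>& node) -> bool {
            const auto weights = node->get_input_node_shared_ptr(0);
            return weights->get_element_type() == ov::element::i8;  // hypothetical veto condition
        });
}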
@@ -310,6 +310,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions
ov::pass::Manager decompression_handling_manager;
decompression_handling_manager.set_per_pass_validation(false);
CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
+const bool useLpt = !defaultPrecisions.empty();
CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
// We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
@@ -330,6 +331,13 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions
if (ov::is_type<ov::op::internal::GatherCompressed>(node)) {
// It is necessary to avoid precision conversion for constant node(compressed weights)
ov::enable_keep_const_precision(node->get_input_node_shared_ptr(0));
+
+if (ov::intel_cpu::one_of(node->get_input_node_shared_ptr(0)->get_element_type(),
+                          ov::element::u8,
+                          ov::element::i8) &&
+    useLpt) {
+    return true;
+}
}
return false;
},
@@ -338,7 +346,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions

ov::pass::Manager manager;
manager.set_per_pass_validation(false);
-const bool useLpt = !defaultPrecisions.empty();
if (useLpt)
CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions);

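Net effect of the two hunks above: useLpt is now computed before the decompression-handling manager runs, and for quantized models (LPT enabled) whose Gather weights are already u8 or i8 the callback returns true, so the pass keeps the plain Gather and its dequantization subgraph for the low-precision pipeline to fold; oneDNN can then consume the int8 weights directly, which is exactly what the new test below asserts. The same veto logic as a standalone sketch (include paths assumed; one_of is the CPU plugin's variadic membership helper):

#include "openvino/core/node.hpp"
#include "ov_ops/gather_compressed.hpp"                      // path assumed
#include "transformations/rt_info/keep_const_precision.hpp"  // path assumed

bool skip_gather_compressed_for_quantized(const std::shared_ptr<const ov::Node>& node, bool useLpt) {
    if (!ov::is_type<ov::op::internal::GatherCompressed>(node))
        return false;
    const auto weights = node->get_input_node_shared_ptr(0);
    // Compressed weights keep their storage precision in either case.
    ov::enable_keep_const_precision(weights);
    const auto prec = weights->get_element_type();
    return useLpt && (prec == ov::element::u8 || prec == ov::element::i8);
}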
@@ -0,0 +1,130 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "common_test_utils/data_utils.hpp"
#include "openvino/runtime/exec_model_info.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"

namespace ov {
namespace test {
/*
* input2
* |
* Constant(i8) Softmax
* | /
* Convert Multiply
* | /
* Multiply Convert input1(u8/i8)
* \ / |
* Gather FakeQuantize
* \ /
* \ /
* MatMul
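*
* The FakeQuantize on input1 makes this a quantized model, so LPT is enabled:
* for i8/u8 weights the plain Gather path must be kept (no GatherCompressed),
* while u4/i4 weights are still expected to fold into GatherCompressed.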
*/
using DisableGatherCompressedForQuantizedModelParams = std::tuple<element::Type, InputShape, InputShape>;
class DisableGatherCompressedForQuantizedModel : public testing::WithParamInterface<DisableGatherCompressedForQuantizedModelParams>,
virtual public SubgraphBaseTest {
public:
static std::string getTestCaseName(testing::TestParamInfo<DisableGatherCompressedForQuantizedModelParams> obj) {
element::Type weight_prec;
InputShape inputShape1, inputShape2;
std::tie(weight_prec, inputShape1, inputShape2) = obj.param;
std::ostringstream result;
result << "weight_prec=" << weight_prec << "_" << "inputShape1=" << inputShape1 << "_"
<< "inputShape2=" << inputShape2;
return result.str();
}

protected:
void SetUp() override {
targetDevice = ov::test::utils::DEVICE_CPU;
element::Type weight_prec;
InputShape inputShape1, inputShape2;
std::tie(weight_prec, inputShape1, inputShape2) = GetParam();

init_input_shapes({inputShape1, inputShape2});

auto type = element::f32;

auto input1 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[0]);
auto input2 = std::make_shared<op::v0::Parameter>(type, inputDynamicShapes[1]);

auto shared_il = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
auto shared_ih = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
auto shared_ol = op::v0::Constant::create(type, {1, 1, 1, 1}, {0.f});
auto shared_oh = op::v0::Constant::create(type, {1, 1, 1, 1}, {12.5f});
auto fq = std::make_shared<op::v0::FakeQuantize>(input1, shared_il, shared_ih, shared_ol, shared_oh, 256);

// Weights
auto weights_shape = Shape{64, 64};
auto weights_vals = test::utils::generate_float_numbers(shape_size(weights_shape), -1, 1);
auto weights = op::v0::Constant::create(weight_prec, weights_shape, weights_vals);
auto convert = std::make_shared<op::v0::Convert>(weights, element::f32);
auto multiply = std::make_shared<op::v1::Multiply>(convert, op::v0::Constant::create(type, {1, 1}, {0.625}));
// Indices
auto softmax = std::make_shared<op::v1::Softmax>(input2, 0);
auto multiply2 = std::make_shared<op::v1::Multiply>(softmax, op::v0::Constant::create(type, {1}, {64}));
auto indices = std::make_shared<op::v0::Convert>(multiply2, element::i64);
// Gather
auto gather =
std::make_shared<op::v8::Gather>(multiply, indices, op::v0::Constant::create(element::i32, Shape{1}, {0}));

auto matMul = std::make_shared<ov::op::v0::MatMul>(fq, gather, false, true);

function = std::make_shared<Model>(matMul, ParameterVector{input1, input2});
}

void check_results() {
const auto& test_param = GetParam();
const auto compressed_weights_precision = std::get<0>(test_param);

const auto runtime_model = compiledModel.get_runtime_model();
const auto result = runtime_model->get_result();
const auto matmul = result->get_input_node_shared_ptr(0);
if (compressed_weights_precision == element::i8) {
EXPECT_EQ(matmul->get_input_element_type(1), element::i8);
} else if (compressed_weights_precision == element::u8) {
// oneDNN MatMul supports Source(u8, s8) with Weights(s8) only,
// so a Reorder is inserted when the weights are not s8.
const auto mm_producer = matmul->get_input_node_shared_ptr(1);
const auto type = mm_producer->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
EXPECT_EQ(type, "Reorder");
EXPECT_EQ(mm_producer->get_input_element_type(0), compressed_weights_precision);
} else {
// GatherCompressed is kept, so check that the Gather node has at least 4 inputs.
for (const auto& n : runtime_model->get_ordered_ops()) {
const auto type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
if (type == "Gather") {
EXPECT_GE(n->get_input_size(), 4);
}
}
}
}
};

TEST_P(DisableGatherCompressedForQuantizedModel, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
run();
check_results();
}

namespace {

const std::vector<InputShape> inputShapes1 = {{{-1, 3, -1, -1}, {{1, 3, 64, 64}}}};
const std::vector<InputShape> inputShapes2 = {{{}, {{32}}}};
const std::vector<element::Type> weightsPrecisions = {element::i8, element::u8, element::u4, element::i4};

INSTANTIATE_TEST_SUITE_P(smoke_DisableGatherCompressedForQuantizedModel_basic,
DisableGatherCompressedForQuantizedModel,
::testing::Combine(::testing::ValuesIn(weightsPrecisions),
::testing::ValuesIn(inputShapes1),
::testing::ValuesIn(inputShapes2)),
DisableGatherCompressedForQuantizedModel::getTestCaseName);

} // namespace
} // namespace test
} // namespace ov
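To run only this suite locally, the usual gtest filter applies, assuming the standard CPU functional-test binary: ov_cpu_func_tests --gtest_filter='*DisableGatherCompressedForQuantizedModel*'.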