FakeQuantize decomposition #3741 (Merged)
inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp (20 changes: 17 additions & 3 deletions)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -57,6 +57,8 @@
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>

#include <ngraph/opsets/opset2.hpp>
#include <ngraph/opsets/opset3.hpp>
@@ -71,6 +73,8 @@
# include <low_precision/group_convolution.hpp>
# include <low_precision/multiply_to_group_convolution.hpp>

#include "nodes/mkldnn_quantize_node.h"

#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#if defined(_WIN32) || defined(WIN32)
#include <intrin.h>
@@ -227,13 +231,22 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
transformer.transform(nGraphFunc);
}

bool has_fake_quantize = ::ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc);

ngraph::pass::Manager legacyManager;

legacyManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
legacyManager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
legacyManager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);
// Not a legacy transformation as such, but it must remain the last transformation in the pipeline
legacyManager.register_pass<ngraph::pass::UnrollTensorIterator>();

auto legacyPassConfig = legacyManager.get_pass_config();

legacyPassConfig->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
return !MKLDNNQuantizeNode::isNeedToDecompose(node);
});

legacyPassConfig->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
if (auto mul_op = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
auto add_op = std::dynamic_pointer_cast<const ngraph::opset1::Add>(mul_op->get_input_node_shared_ptr(0));
@@ -248,15 +261,16 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
return false;
});

-legacyManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
+legacyPassConfig->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// The UnrollTI transformation is disabled by default; it is enabled by the LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});

legacyManager.run_passes(nGraphFunc);

OV_ITT_TASK_CHAIN(taskChain, MKLDNNPlugin::itt::domains::MKLDNN_LT, "Transformation", "convertFunctionToICNNNetwork");

-clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork));
+clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork, has_fake_quantize));

OV_ITT_TASK_NEXT(taskChain, "ConvertIOPrecision");
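A note on the callback wiring above: a pass-config callback that returns true tells the matcher pass to skip the node, which is why the FakeQuantizeDecomposition callback returns !MKLDNNQuantizeNode::isNeedToDecompose(node): the decomposition runs only where the plugin cannot execute the FakeQuantize natively. A minimal sketch of this gating (illustration only, not part of the PR; the per-tensor predicate below is a simplified stand-in for the plugin check):

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>

// Sketch: gate the decomposition with a pass-config callback. Returning true
// from the callback disables the pass for that node.
void runGatedDecomposition(std::shared_ptr<ngraph::Function> f) {
    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
    manager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>(
        [](const std::shared_ptr<const ngraph::Node>& node) -> bool {
            // Simplified stand-in for !MKLDNNQuantizeNode::isNeedToDecompose(node):
            // keep FakeQuantize ops whose range inputs are per-tensor scalars.
            for (size_t i = 1; i < node->get_input_size(); ++i)
                if (ngraph::shape_size(node->get_input_shape(i)) != 1)
                    return false;
            return true;
        });
    manager.run_passes(f);
}
```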

inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp
@@ -18,6 +18,8 @@
#include <cpu/x64/jit_generator.hpp>
#include "ie_parallel.hpp"

#include <ngraph/opsets/opset1.hpp>

// Quantization range validation is switched off by default in order to avoid regressions on the user side
// #define VALIDATE_QUANTIZATION_RANGES

@@ -1029,7 +1031,7 @@ void MKLDNNQuantizeNode::init() {
float ih = inputHighData[isInputHighBroadcasted ? 0 : i];

#if defined(VALIDATE_QUANTIZATION_RANGES)
-if ((il == ih && levels != 2) || std::isnan(il) || std::isnan(ih) || std::isinf(il) || std::isinf(ih)) {
+if ((il == ih && levels != 2) || il > ih || std::isnan(il) || std::isnan(ih) || std::isinf(il) || std::isinf(ih)) {
THROW_IE_EXCEPTION << "Quantize layer with name '" << getName() << "' has invalid input quantize ranges: "
<< "inputLow = " << il << ", inputHigh = " << ih;
}
@@ -1578,6 +1580,33 @@ void MKLDNNQuantizeNode::appendPostOps(mkldnn::post_ops& ops) {
isPostOpDataInitialized = true;
}

bool MKLDNNQuantizeNode::isNeedToDecompose(const std::shared_ptr<const ngraph::Node>& node) {
if (const auto fq = std::dynamic_pointer_cast<const ngraph::opset1::FakeQuantize>(node)) {
// tensors with rank above 5 are not handled by the quantize node, so decompose
for (size_t i = 0; i < fq->get_input_size(); i++) {
if (fq->get_input_shape(i).size() > 5)
return true;
}

// per-tensor ranges and per-channel ranges broadcast along axis 1 are handled natively;
// any other broadcast pattern requires decomposition
for (size_t i = 1; i < fq->get_input_size(); i++) {
size_t count_not_unit_axis = 0;
auto shape = fq->get_input_shape(i);

if (ngraph::shape_size(shape) != 1) {
size_t not_unit_axis = 0;
for (size_t j = 0; j < shape.size(); j++) {
if (shape[j] > 1) {
not_unit_axis = j;
count_not_unit_axis++;
}
}
if (count_not_unit_axis > 1 || not_unit_axis > 1)
return true;
}
}
}
return false;
}

bool MKLDNNQuantizeNode::created() const {
return getType() == Quantize;
}
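For illustration, here is a sketch (assuming the ngraph opset1 API; not code from this PR) of a FakeQuantize that the check above flags for decomposition, because its range inputs broadcast along a non-channel axis:

```cpp
#include <ngraph/opsets/opset1.hpp>

// Sketch: range inputs of shape {1, 1, 16, 1} broadcast along axis 2 rather
// than the channel axis 1, so isNeedToDecompose() returns true for this node.
std::shared_ptr<ngraph::Node> makeDecomposableFq() {
    using namespace ngraph;
    auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 16, 16});
    auto range = [](float v) {
        return opset1::Constant::create(element::f32, Shape{1, 1, 16, 1}, std::vector<float>(16, v));
    };
    return std::make_shared<opset1::FakeQuantize>(data, range(0.f), range(10.f),
                                                  range(0.f), range(10.f), 256);
}
```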
inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -113,6 +113,8 @@ class MKLDNNQuantizeNode : public MKLDNNNode {

void appendPostOps(mkldnn::post_ops& ops) override;

static bool isNeedToDecompose(const std::shared_ptr<const ngraph::Node>& node);

private:
void init() override;
std::vector<mkldnn::memory::format_tag> getDataFormats() const;
transformations/op_conversions/fq_decomposition.hpp (new file)
@@ -0,0 +1,47 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <transformations_visibility.hpp>
#include <ngraph/pass/graph_rewrite.hpp>

namespace ngraph {
namespace pass {

class TRANSFORMATIONS_API FakeQuantizeDecomposition;

} // namespace pass
} // namespace ngraph

/**
* @ingroup ie_transformation_common_api
* @brief FakeQuantizeDecomposition transformation decomposes a FakeQuantize layer into a subgraph of elementary operations.
*
* Expression from the specification:
* if x <= min(input_low, input_high):
*     output = output_low
* elif x > max(input_low, input_high):
*     output = output_high
* else:
*     output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low
*
* Expand the brackets inside round():
*     round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
* then fold the division by (levels-1) and the multiplication by (output_high - output_low) into a single
* multiplication by (output_high - output_low) / (levels-1), which gives:
*
*     round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) + output_low
*
* This transformation is not applied when:
* 1. at least one 'range' input is not a Constant;
* 2. at least one 'input_low' value is greater than or equal to the corresponding 'input_high' value.
*
*/

class ngraph::pass::FakeQuantizeDecomposition: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
FakeQuantizeDecomposition();
};
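The derivation in the header comment can be sanity-checked with scalars. The following standalone sketch (illustration only, not part of the PR) evaluates the specification formula and the decomposed form side by side; both use std::round here, whereas the emitted graph rounds half to even, so either can differ from the graph at exact midpoints:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>

// Specification formula, including the two saturation branches.
float fqSpec(float x, float il, float ih, float ol, float oh, int levels) {
    if (x <= std::min(il, ih)) return ol;
    if (x > std::max(il, ih)) return oh;
    return std::round((x - il) / (ih - il) * (levels - 1)) / (levels - 1) * (oh - ol) + ol;
}

// Decomposed form: clamp, scale/shift, round, rescale, shift.
float fqDecomposed(float x, float il, float ih, float ol, float oh, int levels) {
    x = std::min(std::max(x, il), ih);           // clamp replaces both saturation branches
    const float isc = (levels - 1) / (ih - il);  // input scale
    const float ish = il * isc;                  // input shift
    const float osc = (oh - ol) / (levels - 1);  // output scale
    return std::round(x * isc - ish) * osc + ol;
}

int main() {
    for (float x : {-2.f, 0.f, 0.3f, 5.f, 12.f})
        assert(std::fabs(fqSpec(x, 0.f, 10.f, 0.f, 10.f, 256) -
                         fqDecomposed(x, 0.f, 10.f, 0.f, 10.f, 256)) < 1e-5f);
    return 0;
}
```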
transformations/op_conversions/fq_decomposition.cpp (new file)
@@ -0,0 +1,124 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "itt.hpp"
#include "transformations/op_conversions/fq_decomposition.hpp"

#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/builder/autobroadcast.hpp>

#include <numeric>

NGRAPH_RTTI_DEFINITION(ngraph::pass::FakeQuantizeDecomposition, "FakeQuantizeDecomposition", 0);

bool isValidRangesInputs(const std::shared_ptr<ngraph::opset1::FakeQuantize> &fq) {
auto il = fq->input_value(1);
auto ih = fq->input_value(2);
auto greater_equal = std::make_shared<ngraph::opset1::GreaterEqual>(il, ih);

ngraph::OutputVector result(1);
if (!greater_equal->constant_fold(result, greater_equal->input_values()))
return false;

auto res_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(result[0].get_node_shared_ptr());

const std::vector<bool> comp_result = res_node->cast_vector<bool>();

return std::none_of(comp_result.begin(), comp_result.end(), [](const bool value) { return value; });
}

ngraph::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
MATCHER_SCOPE(FakeQuantizeDecomposition);
auto data = ngraph::pattern::any_input();
auto il = ngraph::pattern::wrap_type<opset1::Constant>();
auto ih = ngraph::pattern::wrap_type<opset1::Constant>();
auto ol = ngraph::pattern::wrap_type<opset1::Constant>();
auto oh = ngraph::pattern::wrap_type<opset1::Constant>();
auto fake_quantize = ngraph::pattern::wrap_type<ngraph::opset1::FakeQuantize>({data, il, ih, ol, oh});

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
auto &pattern_to_output = m.get_pattern_value_map();
const auto fake_quantize_node = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(pattern_to_output.at(fake_quantize).get_node_shared_ptr());

if (fake_quantize_node == nullptr || transformation_callback(fake_quantize_node) || !isValidRangesInputs(fake_quantize_node)) {
return false;
}

Output<Node> data{fake_quantize_node->input_value(0)};
const Output<Node> input_low{fake_quantize_node->input_value(1)};
const Output<Node> input_high{fake_quantize_node->input_value(2)};
const Output<Node> output_low{fake_quantize_node->input_value(3)};
const Output<Node> output_high{fake_quantize_node->input_value(4)};
auto input_type = data.get_element_type();

ngraph::NodeVector decomp_ops;
if (input_type != input_low.get_element_type()) {
input_type = input_low.get_element_type();
data = std::make_shared<ngraph::opset1::Convert>(data, input_type);
decomp_ops.push_back(data.get_node_shared_ptr());
}

// Substituting x = input_low or x = input_high into the formula yields
// output = output_low and output = output_high respectively, so clamping x
// to [input_low, input_high] covers both saturation branches of the specification
const auto max = std::make_shared<ngraph::opset1::Maximum>(data, input_low);
const auto min = std::make_shared<ngraph::opset1::Minimum>(max, input_high);
decomp_ops.push_back(max);
decomp_ops.push_back(min);

// (levels-1)
const auto levels_minus_one = std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
decomp_ops.push_back(levels_minus_one);
// (input_high - input_low)
const auto subInHighLow = std::make_shared<ngraph::opset1::Subtract>(input_high, input_low);
// (levels-1) / (input_high - input_low)
const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, subInHighLow);
// input_low * (levels-1) / (input_high - input_low)
const auto ish = std::make_shared<ngraph::opset1::Multiply>(input_low, isc);
decomp_ops.push_back(subInHighLow);
decomp_ops.push_back(isc);
decomp_ops.push_back(ish);

// x * (levels-1) / (input_high - input_low)
const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
// x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)
const auto after_ish_apply = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);
decomp_ops.push_back(after_isc_apply);
decomp_ops.push_back(after_ish_apply);

// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
const auto round = std::make_shared<ngraph::opset5::Round>(after_ish_apply, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
decomp_ops.push_back(round);

// (output_high - output_low)
const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(output_high, output_low);
// (output_high - output_low) / (levels-1)
const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);
decomp_ops.push_back(sub_out_high_low);
decomp_ops.push_back(osc);

// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1)
const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(round, osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) +
// output_low
std::shared_ptr<Node> result = std::make_shared<ngraph::opset1::Add>(after_osc_apply, output_low);
decomp_ops.push_back(after_osc_apply);
decomp_ops.push_back(result);

if (result->get_output_element_type(0) != fake_quantize_node->get_output_element_type(0)) {
result = std::make_shared<ngraph::opset1::Convert>(result, fake_quantize_node->get_output_element_type(0));
decomp_ops.push_back(result);
}

result->set_friendly_name(m.get_match_root()->get_friendly_name());
ngraph::copy_runtime_info(fake_quantize_node, decomp_ops);
ngraph::replace_node(m.get_match_root(), result);
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, matcher_name);
register_matcher(m, callback);
}
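Finally, a sketch of exercising the pass end to end (illustration only, not part of the PR): with no callback registered, every matched FakeQuantize with constant, valid ranges is decomposed into Maximum, Minimum, Multiply, Subtract, Round, and Add nodes.

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>

int main() {
    using namespace ngraph;
    auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 8, 8});
    auto il = opset1::Constant::create(element::f32, Shape{}, {0.f});
    auto ih = opset1::Constant::create(element::f32, Shape{}, {10.f});
    auto ol = opset1::Constant::create(element::f32, Shape{}, {0.f});
    auto oh = opset1::Constant::create(element::f32, Shape{}, {10.f});
    auto fq = std::make_shared<opset1::FakeQuantize>(data, il, ih, ol, oh, 256);
    auto f = std::make_shared<Function>(NodeVector{fq}, ParameterVector{data});

    pass::Manager manager;
    manager.register_pass<pass::FakeQuantizeDecomposition>();
    manager.run_passes(f);
    // The FakeQuantize node has now been replaced by the elementary subgraph.
    return 0;
}
```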