FakeQuantize decomposition

mandrono committed Jan 25, 2021
1 parent 96b2ffa commit 15b3708

Showing 9 changed files with 850 additions and 7 deletions.
24 changes: 20 additions & 4 deletions inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -57,6 +57,8 @@
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>

#include <ngraph/opsets/opset2.hpp>
#include <ngraph/opsets/opset3.hpp>
@@ -71,6 +73,8 @@
# include <low_precision/group_convolution.hpp>
# include <low_precision/multiply_to_group_convolution.hpp>

#include "nodes/mkldnn_quantize_node.h"

#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#if defined(_WIN32) || defined(WIN32)
#include <intrin.h>
@@ -226,13 +230,24 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
transformer.transform(nGraphFunc);
}

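// has_op_with_type<FakeQuantize> decides whether Constant inputs must be preserved during the ICNNNetwork conversion below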
bool keep_constant_inputs = ::ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc);

ngraph::pass::Manager legacyManager;

legacyManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
legacyManager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
legacyManager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);
// Not legacy as such, but it must be the last transformation in the pipeline
legacyManager.register_pass<ngraph::pass::UnrollTensorIterator>();

auto legacyPassConfig = legacyManager.get_pass_config();

legacyPassConfig->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
if (const auto fq = std::dynamic_pointer_cast<const ngraph::opset1::FakeQuantize>(node)) {
return !MKLDNNQuantizeNode::isNeedToDecompos(fq);
}
return true;
});

legacyPassConfig->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
if (auto mul_op = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
auto add_op = std::dynamic_pointer_cast<const ngraph::opset1::Add>(mul_op->get_input_node_shared_ptr(0));
@@ -247,15 +262,16 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
return false;
});

legacyManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
legacyPassConfig->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// The UnrollTI transformation is disabled by default and is enabled by the LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});

legacyManager.run_passes(nGraphFunc);

OV_ITT_TASK_CHAIN(taskChain, MKLDNNPlugin::itt::domains::MKLDNN_LT, "Transformation", "convertFunctionToICNNNetwork");

clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork));
clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork, keep_constant_inputs));

OV_ITT_TASK_NEXT(taskChain, "ConvertIOPrecision");

64 changes: 64 additions & 0 deletions inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp
@@ -241,6 +241,11 @@ void MKLDNNQuantizeNode::init() {
}
#endif

if (il > ih) {
THROW_IE_EXCEPTION << "Quantize layer with name '" << getName() << "' has unsupported input quantize ranges inputLow > inputHigh: "
<< il << " > " << ih;
}

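// Affine input quantization: q = x * inputScale + inputShift maps the range [il, ih] onto [0, levels-1]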
inputScale[i] = (levels - 1) / (ih - il);
inputShift[i] = -il * (levels - 1) / (ih - il);
}
@@ -402,6 +407,38 @@ void MKLDNNQuantizeNode::initSupportedPrimitiveDescriptors() {
}
}

void MKLDNNQuantizeNode::filterSupportedPrimitiveDescriptors() {
MKLDNNNode::filterSupportedPrimitiveDescriptors();
filterSupportedDescriptors();
}

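// Keep only the operation descriptors whose source/destination memory format matches the
// externally supplied input/output format filters (at most one filter of each kind is supported)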
void MKLDNNQuantizeNode::filterSupportedDescriptors() {
if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty()) {
if (inputMemoryFormatsFilter.size() > 1 || outputMemoryFormatsFilter.size() > 1) {
THROW_IE_EXCEPTION << "Incorrect number of input or output memory formats for Quantize node";
}
auto itd = descs.begin();
while (itd != descs.end()) {
bool isSuitableDesc = true;
if (!inputMemoryFormatsFilter.empty()) {
auto src_fmt = std::shared_ptr<mkldnn::quantization_forward::desc>(*itd)->data.src_desc.format;
if (src_fmt != inputMemoryFormatsFilter[0])
isSuitableDesc = false;
}
if (!outputMemoryFormatsFilter.empty()) {
auto dst_fmt = std::shared_ptr<mkldnn::quantization_forward::desc>(*itd)->data.dst_desc.format;
if (dst_fmt != outputMemoryFormatsFilter[0])
isSuitableDesc = false;
}
if (!isSuitableDesc) {
itd = descs.erase(itd);
} else {
itd++;
}
}
}
}

void MKLDNNQuantizeNode::createPrimitive() {
if (prim)
return;
@@ -563,6 +600,33 @@ void MKLDNNQuantizeNode::appendPostOps(mkldnn::post_ops& ops) {
ops.append_quantization(quantizeAlgorithm, &cropLowData, &cropHighData, &inputScaleData, &inputShiftData, &outputScaleData, &outputShiftData);
}

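// Decomposition is requested when MKLDNN's Quantize primitive cannot handle the op:
// an input has rank > 5, or a non-scalar 'range' input has more than one non-unit
// dimension, or its single non-unit dimension lies on an axis other than batch (0) or channel (1)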
bool MKLDNNQuantizeNode::isNeedToDecompos(const std::shared_ptr<const ngraph::opset1::FakeQuantize> &fq) {
for (size_t i = 0; i < fq->get_input_size(); i++) {
if (fq->get_input_node_shared_ptr(i)->get_shape().size() > 5)
return true;
}

for (size_t i = 1; i < fq->get_input_size(); i++) {
auto node = fq->get_input_node_shared_ptr(i);

size_t count_not_unit_axis = 0;
auto shape = node->get_shape();
if (ngraph::shape_size(shape) != 1) {
size_t not_unit_axis = 0;
for (size_t j = 0; j < shape.size(); j++) {
if (shape[j] > 1) {
not_unit_axis = j;
count_not_unit_axis++;
}
}
if (count_not_unit_axis > 1 || not_unit_axis > 1)
return true;
}
}

return false;
}

bool MKLDNNQuantizeNode::created() const {
return getType() == Quantize;
}
inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -11,6 +11,7 @@
#include <vector>
#include <utility>
#include <primitive_attr.hpp>
#include <ngraph/opsets/opset1.hpp>

namespace MKLDNNPlugin {

@@ -25,6 +26,9 @@ class MKLDNNQuantizeNode : public MKLDNNNode {
bool created() const override;
void execute(mkldnn::stream strm) override;

void filterSupportedPrimitiveDescriptors() override;
void filterSupportedDescriptors();

size_t getAxis() const { return axis; }

bool isBinarization() const { return quantizeAlgorithm == mkldnn::algorithm::binarization_depthwise; }
@@ -59,6 +63,8 @@ class MKLDNNQuantizeNode : public MKLDNNNode {

void appendPostOps(mkldnn::post_ops& ops) override;

static bool isNeedToDecompos(const std::shared_ptr<const ngraph::opset1::FakeQuantize> &fq);

private:
void init() override;
std::vector<mkldnn::memory::format> getDataFormats() const;
inference-engine/src/transformations/include/transformations/op_conversions/fq_decomposition.hpp
@@ -0,0 +1,29 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <transformations_visibility.hpp>
#include <ngraph/pass/graph_rewrite.hpp>

namespace ngraph {
namespace pass {

class TRANSFORMATIONS_API FakeQuantizeDecomposition;

} // namespace pass
} // namespace ngraph

/**
 * @ingroup ie_transformation_common_api
 * @brief FakeQuantizeDecomposition transformation decomposes a FakeQuantize layer into a sub-graph of elementary operations if:
 * 1. an input node has rank > 5, or
 * 2. a 'range' node has more than one dimension != 1, or its single non-unit dimension is neither batch nor channel.
 * The 'range' nodes must be Constant, and input_low (IL) values must be less than input_high (IH).
 */
class ngraph::pass::FakeQuantizeDecomposition: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
FakeQuantizeDecomposition();
};
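
For orientation, here is a minimal usage sketch (not part of this commit; the helper name decompose_fq is hypothetical). It registers the new pass with an ngraph::pass::Manager, mirroring the registration in mkldnn_plugin.cpp above; without a pass-config callback, every matched FakeQuantize is decomposed.

#include <memory>
#include <ngraph/pass/manager.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>

// Hypothetical helper: run FakeQuantizeDecomposition over a whole function.
void decompose_fq(const std::shared_ptr<ngraph::Function>& f) {
    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
    manager.run_passes(f);
}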
inference-engine/src/transformations/src/transformations/op_conversions/fq_decomposition.cpp
@@ -0,0 +1,182 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "itt.hpp"
#include "transformations/op_conversions/fq_decomposition.hpp"

#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/builder/autobroadcast.hpp>

#include <numeric>

NGRAPH_RTTI_DEFINITION(ngraph::pass::FakeQuantizeDecomposition, "FakeQuantizeDecomposition", 0);

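// The decomposition assumes all 'range' inputs (input_low/high, output_low/high) are Constant and that
// input_low < input_high holds element-wise after NumPy broadcasting; anything else is rejected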
bool isValidRangesInputs(const std::shared_ptr<ngraph::opset1::FakeQuantize> &fq) {
for (size_t i = 1; i < fq->get_input_size(); i++) {
if (!std::dynamic_pointer_cast<const ngraph::opset1::Constant>(fq->get_input_node_shared_ptr(i)))
return false;
}

auto il_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(fq->get_input_node_shared_ptr(1));
auto ih_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(fq->get_input_node_shared_ptr(2));

auto broadcasted_shape = ngraph::builder::get_numpy_broadcast_shapes({il_node->get_shape(), ih_node->get_shape()});
auto result_shape = broadcasted_shape.first;
auto il_padded_shape = broadcasted_shape.second.at(0);
auto ih_padded_shape = broadcasted_shape.second.at(1);

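// A stride of 0 marks a broadcast (size-1) dimension, so get_index below keeps re-reading
// the same constant element while iterating over the broadcast result shape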
auto get_strides = [](const ngraph::Shape& in_shape, const ngraph::Shape& res_shape) -> std::vector<size_t> {
std::vector<size_t> strides(res_shape.size());
size_t k = 1;
for (int i = in_shape.size() - 1; i >= 0; i--) {
strides[i] = (in_shape[i] == res_shape[i]) ? k : 0;
k *= in_shape[i];
}
return strides;
};

auto il_strides = get_strides(il_padded_shape, result_shape);
auto ih_strides = get_strides(ih_padded_shape, result_shape);

auto get_index = [](const ngraph::Coordinate& in_coord, const std::vector<size_t>& strides) -> size_t {
size_t index = 0;
for (size_t i = 0; i < in_coord.size(); i++) {
index += in_coord[i]*strides[i];
}
return index;
};

const std::vector<float> il = il_node->cast_vector<float>();
const std::vector<float> ih = ih_node->cast_vector<float>();

auto step = [](ngraph::Coordinate& _iter, const ngraph::Shape& _dims) {
auto iter = _iter.rbegin();
auto dims = _dims.rbegin();

while (iter != _iter.rend()) {
*iter = (*iter + 1) % *dims;
if (*iter != 0) {
break;
}
++iter;
++dims;
}
};

const size_t work_amount = std::accumulate(result_shape.begin(), result_shape.end(), 1, std::multiplies<size_t>());
ngraph::Coordinate iter(result_shape.size(), 0);
for (size_t i = 0; i < work_amount; i++) {
if (il[get_index(iter, il_strides)] >= ih[get_index(iter, ih_strides)])
return false;
step(iter, result_shape);
}

return true;
}

/**
 * Expression from the specification:
 * if x <= min(input_low, input_high):
 *   output = output_low
 * elif x > max(input_low, input_high):
 *   output = output_high
 * else:
 *   output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low
 *
 * Expand the brackets inside round():
 *   round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
 * The division by (levels-1) followed by multiplication by (output_high - output_low) folds into a single
 * multiplication by (output_high - output_low) / (levels-1), which gives:
 *
 * output = round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) + output_low
 */
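/*
 * Numeric illustration (illustrative values, not from the specification):
 * levels = 256, input_low = 0, input_high = 2.55, output_low = 0, output_high = 2.55, x = 1.0
 *   isc = (levels-1) / (input_high - input_low) = 255 / 2.55 = 100
 *   ish = input_low * isc = 0
 *   round(x * isc - ish) = round(100) = 100
 *   osc = (output_high - output_low) / (levels-1) = 2.55 / 255 = 0.01
 *   output = 100 * 0.01 + output_low = 1.0, i.e. x already lies on the 256-level grid
 */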
ngraph::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
MATCHER_SCOPE(FakeQuantizeDecomposition);
auto fake_quantize = ngraph::pattern::wrap_type<ngraph::opset1::FakeQuantize>();

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
auto &pattern_to_output = m.get_pattern_value_map();
const auto fake_quantize_node = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(pattern_to_output.at(fake_quantize).get_node_shared_ptr());

if (fake_quantize_node == nullptr || transformation_callback(fake_quantize_node) || !isValidRangesInputs(fake_quantize_node)) {
return false;
}

Output<Node> data{fake_quantize_node->input_value(0)};
const Output<Node> input_low{fake_quantize_node->input_value(1)};
const Output<Node> input_high{fake_quantize_node->input_value(2)};
const Output<Node> output_low{fake_quantize_node->input_value(3)};
const Output<Node> output_high{fake_quantize_node->input_value(4)};
auto input_type = data.get_element_type();

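// decomp_ops accumulates every node of the expanded sub-graph so that runtime info can be copied onto all of them at the end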
ngraph::NodeVector decomp_ops;
if (input_type != input_low.get_element_type()) {
input_type = input_low.get_element_type();
data = std::make_shared<ngraph::opset1::Convert>(data, input_type);
decomp_ops.push_back(data.get_node_shared_ptr());
}

// Substituting x = input_low or x = input_high into the formula yields output = output_low or output = output_high respectively,
// so the first two branches of the specification reduce to clamping x to [input_low, input_high]
const auto max = std::make_shared<ngraph::opset1::Maximum>(data, input_low);
const auto min = std::make_shared<ngraph::opset1::Minimum>(max, input_high);
decomp_ops.push_back(max);
decomp_ops.push_back(min);

// (levels-1)
const auto levels_minus_one = std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
decomp_ops.push_back(levels_minus_one);
// (input_high - input_low)
const auto subInHighLow = std::make_shared<ngraph::opset1::Subtract>(input_high, input_low);
// (levels-1) / (input_high - input_low)
const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, subInHighLow);
// input_low * (levels-1) / (input_high - input_low)
const auto ish = std::make_shared<ngraph::opset1::Multiply>(input_low, isc);
decomp_ops.push_back(subInHighLow);
decomp_ops.push_back(isc);
decomp_ops.push_back(ish);

// x * (levels-1) / (input_high - input_low)
const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
// x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)
const auto after_ish_apply = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);
decomp_ops.push_back(after_isc_apply);
decomp_ops.push_back(after_ish_apply);

// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
const auto round = std::make_shared<ngraph::opset5::Round>(after_ish_apply, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
decomp_ops.push_back(round);

// (output_high - output_low)
const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(output_high, output_low);
// (output_high - output_low) / (levels-1)
const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);
decomp_ops.push_back(sub_out_high_low);
decomp_ops.push_back(osc);

// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1)
const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(round, osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) +
// output_low
std::shared_ptr<Node> result = std::make_shared<ngraph::opset1::Add>(after_osc_apply, output_low);
decomp_ops.push_back(after_osc_apply);
decomp_ops.push_back(result);

if (result->get_output_element_type(0) != fake_quantize_node->get_output_element_type(0)) {
result = std::make_shared<ngraph::opset1::Convert>(result, fake_quantize_node->get_output_element_type(0));
decomp_ops.push_back(result);
}

result->set_friendly_name(m.get_match_root()->get_friendly_name());
ngraph::copy_runtime_info(fake_quantize_node, decomp_ops);
ngraph::replace_node(m.get_match_root(), result);
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, matcher_name);
register_matcher(m, callback);
}