FakeQuantize decomposition #3741 (Merged)
inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp (20 changes: 17 additions & 3 deletions)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -57,6 +57,8 @@
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>

#include <ngraph/opsets/opset2.hpp>
#include <ngraph/opsets/opset3.hpp>
@@ -71,6 +73,8 @@
# include <low_precision/group_convolution.hpp>
# include <low_precision/multiply_to_group_convolution.hpp>

#include "nodes/mkldnn_quantize_node.h"

#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#if defined(_WIN32) || defined(WIN32)
#include <intrin.h>
@@ -227,13 +231,22 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
transformer.transform(nGraphFunc);
}

bool has_fake_quantize = ::ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc);

ngraph::pass::Manager legacyManager;

legacyManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
legacyManager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
legacyManager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);
// Not a legacy transformation as such, but it must remain the last transformation in the pipeline
legacyManager.register_pass<ngraph::pass::UnrollTensorIterator>();

auto legacyPassConfig = legacyManager.get_pass_config();

legacyPassConfig->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
return !MKLDNNQuantizeNode::isNeedToDecompose(node);
});

legacyPassConfig->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
if (auto mul_op = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
auto add_op = std::dynamic_pointer_cast<const ngraph::opset1::Add>(mul_op->get_input_node_shared_ptr(0));
@@ -248,15 +261,16 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
return false;
});

-legacyManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
+legacyPassConfig->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// The UnrollTI transformation is disabled by default; it is enabled by the LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});

legacyManager.run_passes(nGraphFunc);

OV_ITT_TASK_CHAIN(taskChain, MKLDNNPlugin::itt::domains::MKLDNN_LT, "Transformation", "convertFunctionToICNNNetwork");

-clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork));
+clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork, has_fake_quantize));

OV_ITT_TASK_NEXT(taskChain, "ConvertIOPrecision");
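A note on the callback wiring above: a pass-config callback that returns true tells the matcher pass to skip the node, which is why the FakeQuantizeDecomposition callback returns !MKLDNNQuantizeNode::isNeedToDecompose(node): the decomposition runs only where the plugin cannot execute the FakeQuantize natively. A minimal sketch of this gating (illustration only, not part of the PR; the per-tensor predicate below is a simplified stand-in for the plugin check):

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>

// Sketch: gate the decomposition with a pass-config callback. Returning true
// from the callback disables the pass for that node.
void runGatedDecomposition(std::shared_ptr<ngraph::Function> f) {
    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
    manager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>(
        [](const std::shared_ptr<const ngraph::Node>& node) -> bool {
            // Simplified stand-in for !MKLDNNQuantizeNode::isNeedToDecompose(node):
            // keep FakeQuantize ops whose range inputs are per-tensor scalars.
            for (size_t i = 1; i < node->get_input_size(); ++i)
                if (ngraph::shape_size(node->get_input_shape(i)) != 1)
                    return false;
            return true;
        });
    manager.run_passes(f);
}
```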

inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp
@@ -18,6 +18,8 @@
#include <cpu/x64/jit_generator.hpp>
#include "ie_parallel.hpp"

#include <ngraph/opsets/opset1.hpp>

// Quantization range validation is switched off by default in order to avoid regressions on the user side
// #define VALIDATE_QUANTIZATION_RANGES

@@ -1029,7 +1031,7 @@ void MKLDNNQuantizeNode::init() {
float ih = inputHighData[isInputHighBroadcasted ? 0 : i];

#if defined(VALIDATE_QUANTIZATION_RANGES)
-if ((il == ih && levels != 2) || std::isnan(il) || std::isnan(ih) || std::isinf(il) || std::isinf(ih)) {
+if ((il == ih && levels != 2) || il > ih || std::isnan(il) || std::isnan(ih) || std::isinf(il) || std::isinf(ih)) {
THROW_IE_EXCEPTION << "Quantize layer with name '" << getName() << "' has invalid input quantize ranges: "
<< "inputLow = " << il << ", inputHigh = " << ih;
}
@@ -1578,6 +1580,33 @@ void MKLDNNQuantizeNode::appendPostOps(mkldnn::post_ops& ops) {
isPostOpDataInitialized = true;
}

bool MKLDNNQuantizeNode::isNeedToDecompose(const std::shared_ptr<const ngraph::Node>& node) {
if (const auto fq = std::dynamic_pointer_cast<const ngraph::opset1::FakeQuantize>(node)) {
// tensors with rank above 5 are not handled by the quantize node, so decompose
for (size_t i = 0; i < fq->get_input_size(); i++) {
if (fq->get_input_shape(i).size() > 5)
return true;
}

// per-tensor ranges and per-channel ranges broadcast along axis 1 are handled natively;
// any other broadcast pattern requires decomposition
for (size_t i = 1; i < fq->get_input_size(); i++) {
size_t count_not_unit_axis = 0;
auto shape = fq->get_input_shape(i);

if (ngraph::shape_size(shape) != 1) {
size_t not_unit_axis = 0;
for (size_t j = 0; j < shape.size(); j++) {
if (shape[j] > 1) {
not_unit_axis = j;
count_not_unit_axis++;
}
}
if (count_not_unit_axis > 1 || not_unit_axis > 1)
return true;
}
}
}
return false;
}

bool MKLDNNQuantizeNode::created() const {
return getType() == Quantize;
}
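For illustration, here is a sketch (assuming the ngraph opset1 API; not code from this PR) of a FakeQuantize that the check above flags for decomposition, because its range inputs broadcast along a non-channel axis:

```cpp
#include <ngraph/opsets/opset1.hpp>

// Sketch: range inputs of shape {1, 1, 16, 1} broadcast along axis 2 rather
// than the channel axis 1, so isNeedToDecompose() returns true for this node.
std::shared_ptr<ngraph::Node> makeDecomposableFq() {
    using namespace ngraph;
    auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 16, 16});
    auto range = [](float v) {
        return opset1::Constant::create(element::f32, Shape{1, 1, 16, 1}, std::vector<float>(16, v));
    };
    return std::make_shared<opset1::FakeQuantize>(data, range(0.f), range(10.f),
                                                  range(0.f), range(10.f), 256);
}
```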
inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -113,6 +113,8 @@ class MKLDNNQuantizeNode : public MKLDNNNode {

void appendPostOps(mkldnn::post_ops& ops) override;

static bool isNeedToDecompose(const std::shared_ptr<const ngraph::Node>& node);

private:
void init() override;
std::vector<mkldnn::memory::format_tag> getDataFormats() const;
transformations/op_conversions/fq_decomposition.hpp (new file)
@@ -0,0 +1,47 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <transformations_visibility.hpp>
#include <ngraph/pass/graph_rewrite.hpp>

namespace ngraph {
namespace pass {

class TRANSFORMATIONS_API FakeQuantizeDecomposition;

} // namespace pass
} // namespace ngraph

/**
* @ingroup ie_transformation_common_api
* @brief FakeQuantizeDecomposition transformation decomposes a FakeQuantize layer into a subgraph of elementary operations.
*
* Expression from the specification:
* if x <= min(input_low, input_high):
*     output = output_low
* elif x > max(input_low, input_high):
*     output = output_high
* else:
*     output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low
*
* Expand the brackets inside round():
*     round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
* then fold the division by (levels-1) and the multiplication by (output_high - output_low) into a single
* multiplication by (output_high - output_low) / (levels-1), which gives:
*
*     round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) + output_low
*
* This transformation is not applied when:
* 1. at least one 'range' input is not a Constant;
* 2. at least one 'input_low' value is greater than or equal to the corresponding 'input_high' value.
*
*/

class ngraph::pass::FakeQuantizeDecomposition: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
FakeQuantizeDecomposition();
};
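The derivation in the header comment can be sanity-checked with scalars. The following standalone sketch (illustration only, not part of the PR) evaluates the specification formula and the decomposed form side by side; both use std::round here, whereas the emitted graph rounds half to even, so either can differ from the graph at exact midpoints:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>

// Specification formula, including the two saturation branches.
float fqSpec(float x, float il, float ih, float ol, float oh, int levels) {
    if (x <= std::min(il, ih)) return ol;
    if (x > std::max(il, ih)) return oh;
    return std::round((x - il) / (ih - il) * (levels - 1)) / (levels - 1) * (oh - ol) + ol;
}

// Decomposed form: clamp, scale/shift, round, rescale, shift.
float fqDecomposed(float x, float il, float ih, float ol, float oh, int levels) {
    x = std::min(std::max(x, il), ih);           // clamp replaces both saturation branches
    const float isc = (levels - 1) / (ih - il);  // input scale
    const float ish = il * isc;                  // input shift
    const float osc = (oh - ol) / (levels - 1);  // output scale
    return std::round(x * isc - ish) * osc + ol;
}

int main() {
    for (float x : {-2.f, 0.f, 0.3f, 5.f, 12.f})
        assert(std::fabs(fqSpec(x, 0.f, 10.f, 0.f, 10.f, 256) -
                         fqDecomposed(x, 0.f, 10.f, 0.f, 10.f, 256)) < 1e-5f);
    return 0;
}
```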
transformations/op_conversions/fq_decomposition.cpp (new file)
@@ -0,0 +1,124 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "itt.hpp"
#include "transformations/op_conversions/fq_decomposition.hpp"

#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/builder/autobroadcast.hpp>

#include <numeric>

NGRAPH_RTTI_DEFINITION(ngraph::pass::FakeQuantizeDecomposition, "FakeQuantizeDecomposition", 0);

bool isValidRangesInputs(const std::shared_ptr<ngraph::opset1::FakeQuantize> &fq) {
auto il = fq->input_value(1);
auto ih = fq->input_value(2);
auto greater_equal = std::make_shared<ngraph::opset1::GreaterEqual>(il, ih);

ngraph::OutputVector result(1);
if (!greater_equal->constant_fold(result, greater_equal->input_values()))
return false;

auto res_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(result[0].get_node_shared_ptr());

const std::vector<bool> comp_result = res_node->cast_vector<bool>();

return std::none_of(comp_result.begin(), comp_result.end(), [](const bool value) { return value; });
}

ngraph::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
MATCHER_SCOPE(FakeQuantizeDecomposition);
auto data = ngraph::pattern::any_input();
auto il = ngraph::pattern::wrap_type<opset1::Constant>();
auto ih = ngraph::pattern::wrap_type<opset1::Constant>();
auto ol = ngraph::pattern::wrap_type<opset1::Constant>();
auto oh = ngraph::pattern::wrap_type<opset1::Constant>();
auto fake_quantize = ngraph::pattern::wrap_type<ngraph::opset1::FakeQuantize>({data, il, ih, ol, oh});

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
auto &pattern_to_output = m.get_pattern_value_map();
const auto fake_quantize_node = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(pattern_to_output.at(fake_quantize).get_node_shared_ptr());

if (fake_quantize_node == nullptr || transformation_callback(fake_quantize_node) || !isValidRangesInputs(fake_quantize_node)) {
return false;
}

Output<Node> data{fake_quantize_node->input_value(0)};
const Output<Node> input_low{fake_quantize_node->input_value(1)};
const Output<Node> input_high{fake_quantize_node->input_value(2)};
const Output<Node> output_low{fake_quantize_node->input_value(3)};
const Output<Node> output_high{fake_quantize_node->input_value(4)};
auto input_type = data.get_element_type();

ngraph::NodeVector decomp_ops;
if (input_type != input_low.get_element_type()) {
input_type = input_low.get_element_type();
data = std::make_shared<ngraph::opset1::Convert>(data, input_type);
decomp_ops.push_back(data.get_node_shared_ptr());
}

// Substituting x = input_low or x = input_high into the formula yields
// output = output_low and output = output_high respectively, so clamping x
// to [input_low, input_high] covers both saturation branches of the specification
const auto max = std::make_shared<ngraph::opset1::Maximum>(data, input_low);
const auto min = std::make_shared<ngraph::opset1::Minimum>(max, input_high);
decomp_ops.push_back(max);
decomp_ops.push_back(min);

// (levels-1)
const auto levels_minus_one = std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
decomp_ops.push_back(levels_minus_one);
// (input_high - input_low)
const auto subInHighLow = std::make_shared<ngraph::opset1::Subtract>(input_high, input_low);
// (levels-1) / (input_high - input_low)
const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, subInHighLow);
// input_low * (levels-1) / (input_high - input_low)
const auto ish = std::make_shared<ngraph::opset1::Multiply>(input_low, isc);
decomp_ops.push_back(subInHighLow);
decomp_ops.push_back(isc);
decomp_ops.push_back(ish);

// x * (levels-1) / (input_high - input_low)
const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
// x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)
const auto after_ish_apply = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);
decomp_ops.push_back(after_isc_apply);
decomp_ops.push_back(after_ish_apply);

// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
const auto round = std::make_shared<ngraph::opset5::Round>(after_ish_apply, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
decomp_ops.push_back(round);

// (output_high - output_low)
const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(output_high, output_low);
// (output_high - output_low) / (levels-1)
const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);
decomp_ops.push_back(sub_out_high_low);
decomp_ops.push_back(osc);

// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1)
const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(round, osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) +
// output_low
std::shared_ptr<Node> result = std::make_shared<ngraph::opset1::Add>(after_osc_apply, output_low);
decomp_ops.push_back(after_osc_apply);
decomp_ops.push_back(result);

if (result->get_output_element_type(0) != fake_quantize_node->get_output_element_type(0)) {
result = std::make_shared<ngraph::opset1::Convert>(result, fake_quantize_node->get_output_element_type(0));
decomp_ops.push_back(result);
}

result->set_friendly_name(m.get_match_root()->get_friendly_name());
ngraph::copy_runtime_info(fake_quantize_node, decomp_ops);
ngraph::replace_node(m.get_match_root(), result);
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, matcher_name);
register_matcher(m, callback);
}
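Finally, a sketch of exercising the pass end to end (illustration only, not part of the PR): with no callback registered, every matched FakeQuantize with constant, valid ranges is decomposed into Maximum, Minimum, Multiply, Subtract, Round, and Add nodes.

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>

int main() {
    using namespace ngraph;
    auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 8, 8});
    auto il = opset1::Constant::create(element::f32, Shape{}, {0.f});
    auto ih = opset1::Constant::create(element::f32, Shape{}, {10.f});
    auto ol = opset1::Constant::create(element::f32, Shape{}, {0.f});
    auto oh = opset1::Constant::create(element::f32, Shape{}, {10.f});
    auto fq = std::make_shared<opset1::FakeQuantize>(data, il, ih, ol, oh, 256);
    auto f = std::make_shared<Function>(NodeVector{fq}, ParameterVector{data});

    pass::Manager manager;
    manager.register_pass<pass::FakeQuantizeDecomposition>();
    manager.run_passes(f);
    // The FakeQuantize node has now been replaced by the elementary subgraph.
    return 0;
}
```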