Snippets: precision propagation (#14996)
eshoguli authored Mar 23, 2023
1 parent 5fa95ff commit 087b10f
Showing 48 changed files with 2,066 additions and 327 deletions.
1 change: 0 additions & 1 deletion src/bindings/python/tests/__init__.py
@@ -117,7 +117,6 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True):

xfail_issue_63033 = xfail_test(reason="BatchNormalization: Training mode is not supported")
xfail_issue_63036 = xfail_test(reason="Changes in ConvTranspose padding")
xfail_issue_63039 = xfail_test(reason="Result mismatches with UINT8 operations")
xfail_issue_63043 = xfail_test(reason="Recurrent node expects constants as W, R, B inputs.")

skip_rng_tests = pytest.mark.skip(reason="Tests use random number generator with no seed.")
5 changes: 0 additions & 5 deletions src/bindings/python/tests/test_onnx/test_backend.py
@@ -37,7 +37,6 @@
xfail_issue_58033,
xfail_issue_63033,
xfail_issue_63036,
xfail_issue_63039,
xfail_issue_63043,
xfail_issue_63137,
xfail_issue_63138,
@@ -278,10 +277,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None
"OnnxBackendNodeModelTest.test_batchnorm_example_training_mode_cpu",
),
(xfail_issue_63036, "OnnxBackendNodeModelTest.test_convtranspose_autopad_same_cpu"),
(
xfail_issue_63039,
"OnnxBackendNodeModelTest.test_div_uint8_cpu",
),
(
xfail_issue_63043,
"OnnxBackendNodeModelTest.test_gru_batchwise_cpu",
1 change: 0 additions & 1 deletion src/bindings/python/tests_compatibility/__init__.py
@@ -122,7 +122,6 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True):

xfail_issue_63033 = xfail_test(reason="BatchNormalization: Training mode is not supported")
xfail_issue_63036 = xfail_test(reason="Changes in ConvTranspose padding")
xfail_issue_63039 = xfail_test(reason="Result mismatches with UINT8 operations")
xfail_issue_63043 = xfail_test(reason="Recurrent node expects constants as W, R, B inputs.")

skip_rng_tests = pytest.mark.skip(reason="Tests use random number generator with no seed.")
5 changes: 0 additions & 5 deletions src/bindings/python/tests_compatibility/test_onnx/test_backend.py
@@ -37,7 +37,6 @@
xfail_issue_58033,
xfail_issue_63033,
xfail_issue_63036,
xfail_issue_63039,
xfail_issue_63043,
xfail_issue_63137,
xfail_issue_63138,
@@ -282,10 +281,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None
"OnnxBackendNodeModelTest.test_batchnorm_example_training_mode_cpu",
),
(xfail_issue_63036, "OnnxBackendNodeModelTest.test_convtranspose_autopad_same_cpu"),
(
xfail_issue_63039,
"OnnxBackendNodeModelTest.test_div_uint8_cpu",
),
(
xfail_issue_63043,
"OnnxBackendNodeModelTest.test_gru_batchwise_cpu",
15 changes: 13 additions & 2 deletions src/common/snippets/include/snippets/generator.hpp
@@ -16,6 +16,8 @@ namespace snippets {

auto getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph::snippets::RegInfo;

typedef std::pair<std::function<std::shared_ptr<Emitter>(const std::shared_ptr<ngraph::Node>&)>,
std::function<std::set<std::vector<element::Type>>(const std::shared_ptr<ngraph::Node>&)>> jitters_value;
/**
* @interface TargetMachine
* @brief Base class for target machine representation. A target derives from this class to provide the generator with information about supported emitters
@@ -51,7 +53,16 @@ class TargetMachine {
if (jitter == jitters.end()) {
throw ngraph_error(std::string("Target code emitter is not available for ") + type.name + " operation.");
}
return jitter->second;
return jitter->second.first;
}

std::function<std::set<std::vector<element::Type>>(const std::shared_ptr<ngraph::Node>&)>
get_supported_precisions(const ngraph::DiscreteTypeInfo type) const {
auto jitter = jitters.find(type);
if (jitter == jitters.end()) {
throw ngraph_error(std::string("Target code emitter is not available for ") + type.name + " operation.");
}
return jitter->second.second;
}

/**
@@ -64,7 +75,7 @@
virtual ~TargetMachine() = default;

protected:
std::map<const ngraph::DiscreteTypeInfo, std::function<std::shared_ptr<Emitter>(std::shared_ptr<ngraph::Node>)>> jitters;
std::map<const ngraph::DiscreteTypeInfo, jitters_value> jitters;
};
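// A sketch of how a target might populate an entry in the reworked jitters map
// (MyConvertEmitter and the listed precision sets are hypothetical; only the pair
// layout follows the jitters_value typedef above):
//
//   jitters[op::ConvertSaturation::get_type_info_static()] = {
//       [](const std::shared_ptr<ngraph::Node>& n) -> std::shared_ptr<Emitter> {
//           return std::make_shared<MyConvertEmitter>(n);  // emitter factory
//       },
//       [](const std::shared_ptr<ngraph::Node>& n) -> std::set<std::vector<element::Type>> {
//           return {{element::f32}, {element::i8}, {element::u8}};  // supported input precisions
//       }};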

/**
16 changes: 11 additions & 5 deletions src/common/snippets/include/snippets/op/subgraph.hpp
@@ -101,11 +101,17 @@ class Subgraph : public ov::op::util::SubGraphOp {
bool is_quantized() const { return config.m_is_quantized; }
bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; }
bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; }

snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
snippets::Schedule generate(const BlockedShapeVector& output_shapes,
const BlockedShapeVector& input_shapes,
ngraph::pass::Manager& pre_dialect,
ngraph::pass::Manager& post_dialect,
ngraph::pass::Manager& post_precision,
const void* compile_params = nullptr);
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr);
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
snippets::Schedule generate(ngraph::pass::Manager& pre_dialect,
ngraph::pass::Manager& post_dialect,
ngraph::pass::Manager& post_precision,
const void* compile_params = nullptr);
snippets::Schedule generate(const void* compile_params = nullptr);
ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
std::vector<PartialShape> reshape_body(const std::vector<PartialShape>& input_shapes);
@@ -132,6 +138,8 @@ class Subgraph : public ov::op::util::SubGraphOp {
// This check returns true if a Constant op that is an input of this op should be inside the Subgraph body
static auto constant_input_should_be_inside_body(const std::shared_ptr<ov::Node>& node) -> bool;

static bool check_broadcast(const std::shared_ptr<const ov::Node>& node) noexcept;

private:
void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
void convert_to_snippet_dialect();
@@ -164,8 +172,6 @@
public:
// True if Subgraph contains FakeQuantize -> FQ decomposition should be called
bool m_is_quantized = false;
// True if we should align element types inside the body
bool m_is_needed_to_align_precision = false;
// True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy the body using mutexes
// because TypeRelaxed::copy_with_new_inputs() isn't a thread-safe method
bool m_has_type_relaxed_ops = false;
46 changes: 0 additions & 46 deletions src/common/snippets/include/snippets/pass/align_element_type.hpp

This file was deleted.

@@ -29,15 +29,15 @@ namespace pass {
*
* Expand brackets:
* round(x * (levels-1) / (ih - il) - il * (levels-1) / (ih - il)) * (oh - ol) / (levels-1) + ol
*
*
* Marking:
* - isc := (levels-1) / (ih - il)
* - ish := -il * isc
* - osc := (oh - ol) / (levels-1)
* - osh := ol
* Final expression:
* round(x * isc + ish) * osc + osh
*
*
* Some optimizations (example for scalars):
* 1. If the FQ output element type is U8 and il = 0, ish = 0, osc = 1, osh = 0, the expression x * isc is sufficient
* 2. If the FQ output element type is I8 and ish ~= 128, osc = 1, osh ~= -128, il * isc ~= -128, ih * isc ~= 127, the expression x * isc is sufficient
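*
* A scalar sketch of the full expression above (illustrative names, assuming float inputs; not part of this header):
*   const float isc = (levels - 1) / (ih - il);   // input scale
*   const float ish = -il * isc;                  // input shift
*   const float osc = (oh - ol) / (levels - 1);   // output scale
*   const float osh = ol;                         // output shift
*   const float y   = std::round(x * isc + ish) * osc + osh;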
@@ -54,7 +54,6 @@ class FakeQuantizeDecomposition : public ngraph::pass::MatcherPass {
public:
FakeQuantizeDecomposition();

static bool isAllScalarConstant(const std::shared_ptr<const ngraph::Node>& node);
static bool getScalesAndShifts(const std::shared_ptr<const ngraph::op::v0::FakeQuantize>& fq_node,
std::vector<float>& cl,
std::vector<float>& ch,
48 changes: 48 additions & 0 deletions src/common/snippets/include/snippets/pass/propagate_precision.hpp
@@ -0,0 +1,48 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <ngraph/pass/pass.hpp>
#include "snippets/generator.hpp"

namespace ngraph {
namespace snippets {
namespace pass {

/**
* @class PropagatePrecision
* @ingroup snippets
* @brief PropagatePrecision transformation propagates precisions from parameters to results.
*/
class PropagatePrecision: public ngraph::pass::FunctionPass {
public:
OPENVINO_RTTI("PropagatePrecision", "0");
PropagatePrecision(const std::shared_ptr<const TargetMachine>& target_machine);
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

static std::vector<element::Type> get_precisions(
const std::vector<element::Type>& input_precisions,
const std::set<std::vector<element::Type>>& supported_precisions) noexcept;

// if can_be_removed returns true, then the actual conversion (actual_before => actual_after)
// can be replaced with the required one (actual_before => required_after)
static bool can_be_removed(
const element::Type& actual_before,
const element::Type& actual_after,
const element::Type& required_after) noexcept;

// if can_be_fused returns true, then the actual conversion can be replaced with the required one
static bool can_be_fused(
const element::Type& actual,
const element::Type& required) noexcept;

private:
const std::shared_ptr<const TargetMachine> target_machine;
};

} // namespace pass
} // namespace snippets
} // namespace ngraph
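
A minimal usage sketch of the new pass, assuming a TargetMachine instance target_machine and a body model body (it mirrors the registration added in subgraph.cpp below; the variable names are illustrative):

    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::snippets::pass::PropagatePrecision>(target_machine);
    manager.register_pass<ngraph::pass::ConstantFolding>();
    manager.run_passes(body);

For get_precisions, a natural reading of the contract is that an exact match wins: given input precisions {f32, f32} and supported sets {{f32, f32}, {bf16, bf16}}, the pass would keep {f32, f32} and insert no Converts.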
80 changes: 49 additions & 31 deletions src/common/snippets/src/op/subgraph.cpp
@@ -11,14 +11,14 @@
#include "snippets/pass/insert_movebroadcast.hpp"
#include "snippets/pass/broadcast_to_movebroadcast.hpp"
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
#include "snippets/pass/propagate_precision.hpp"
#include "snippets/pass/assign_registers.hpp"
#include "snippets/pass/convert_constants.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include "snippets/pass/vector_to_scalar.hpp"
#include "snippets/pass/insert_loops.hpp"
#include "snippets/pass/transpose_decomposition.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "snippets/pass/align_element_type.hpp"
#include "snippets/pass/matmul_to_brgemm.hpp"
#include "snippets/pass/fuse_transpose_brgemm.hpp"
#include "snippets/pass/softmax_decomposition.hpp"
@@ -62,10 +62,6 @@ void snippets::op::Subgraph::init_config() {
ov::is_type<ov::op::v0::FakeQuantize>(op);
config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops ||
std::dynamic_pointer_cast<ov::op::TypeRelaxedBase>(op);
config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision ||
is_quantized() ||
has_type_relaxed_ops() ||
snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type);
config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops ||
ov::is_type<ov::op::v1::Transpose>(op) ||
ov::is_type<ov::op::v1::Softmax>(op) ||
@@ -359,6 +355,14 @@ ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector&
return master_shape;
}

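// Broadcast is supported unless the node is a binary elementwise op with PDPD
// auto-broadcast and inputs of different rank.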
bool snippets::op::Subgraph::check_broadcast(const std::shared_ptr<const ov::Node>& node) noexcept {
const auto elementwise = std::dynamic_pointer_cast<const ov::op::util::BinaryElementwiseArithmetic>(node);
return
(elementwise == nullptr) ||
(elementwise->get_input_partial_shape(0).size() == elementwise->get_input_partial_shape(1).size()) ||
(elementwise->get_autob().m_type != ov::op::AutoBroadcastType::PDPD);
}

void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
const BlockedShapeVector& inputShapes) {
// We should insert Convert before Results to set original output element type if needed
@@ -369,35 +373,34 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
body_results[i]->get_input_node_shared_ptr(0), needed_out_type);
body_results[i]->set_argument(0, convert);
body_results[i]->validate_and_infer_types();
}
}

// We should change existing element type to original for Parameters if needed
const auto& body_parameters = body_ptr()->get_parameters();
const auto& parameters = body_ptr()->get_parameters();
for (size_t i = 0; i < inputShapes.size(); ++i) {
const auto needed_in_type = std::get<2>(inputShapes[i]);
if (body_parameters[i]->get_element_type() != needed_in_type) {
body_parameters[i]->set_element_type(needed_in_type);
config.m_is_needed_to_align_precision = true;
}
}
const auto& parameter = parameters[i];
if (parameter->get_element_type() != needed_in_type) {
const auto parameter_output = parameter->output(0);
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
parameter_output,
parameter_output.get_element_type());
ngraph::copy_runtime_info(parameter, convert);

for (const auto input : parameter_output.get_target_inputs()) {
const auto& input_node = input.get_node();
if (input_node == convert.get()) {
continue;
}
input_node->set_argument(input.get_index(), convert->output(0));
}

// We should align element types inside the body using the corresponding pass:
// - Insert Convert before operations that don't support the original element type for execution
// - Insert reverse Convert before operations that support the original element type
//   but have inputs that don't support it (because a Convert with exec_type will be inserted before them - see the first point)
// - Then we should use the ConstantFolding pass to convert the element type of Scalars before inference.
// - Eliminate redundant Converts which can be inserted by the AlignElementType() pass
ngraph::pass::Manager manager;
if (config.m_is_needed_to_align_precision) {
manager.register_pass<snippets::pass::AlignElementType>(execution_element_type);
manager.register_pass<ov::pass::ConstantFolding>();
// TODO [100041] : In some cases AlignElementType pass can insert extra Convert because
// the pass doesn't know real precisions in real time.
// We call EliminateConverts pass to remove them
manager.register_pass<ov::pass::EliminateConvert>();
parameter->set_element_type(needed_in_type);
parameter->validate_and_infer_types();
}
}
manager.run_passes(body_ptr());
}

void snippets::op::Subgraph::initialize_buffer_scratchpad_size() {
@@ -602,24 +605,39 @@ snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& ou

snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
const BlockedShapeVector& input_shapes,
ngraph::pass::Manager& opt,
ngraph::pass::Manager& pre_dialect,
ngraph::pass::Manager& post_dialect,
ngraph::pass::Manager& post_precision,
const void* compile_params) {
canonicalize(output_shapes, input_shapes);
return generate(opt, compile_params);
return generate(pre_dialect, post_dialect, post_precision, compile_params);
}

snippets::Schedule snippets::op::Subgraph::generate(const void* compile_params) {
auto mngr = ngraph::pass::Manager();
return generate(mngr, compile_params);
return generate(mngr, mngr, mngr, compile_params);
}

snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const void* compile_params) {
snippets::Schedule snippets::op::Subgraph::generate(
ngraph::pass::Manager& pre_dialect,
ngraph::pass::Manager& post_dialect,
ngraph::pass::Manager& post_precision,
const void* compile_params) {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate")
NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set");

pre_dialect.run_passes(body_ptr());
convert_to_snippet_dialect();
opt.run_passes(body_ptr());
post_dialect.run_passes(body_ptr());

ngraph::pass::Manager precision_manager;
precision_manager.register_pass<snippets::pass::PropagatePrecision>(m_generator->get_target_machine());
precision_manager.register_pass<ngraph::pass::ConstantFolding>();
precision_manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
precision_manager.run_passes(body_ptr());

post_precision.run_passes(body_ptr());

// After all passes, when all optimizations are completed and all MemoryAccess ops are inserted,
// we can calculate the common buffer scratchpad size and propagate offsets from Buffers to the corresponding MemoryAccess ops
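
A hedged usage sketch of the new three-stage generate() entry point (the subgraph variable and empty managers are illustrative; a backend would register its own passes on each stage):

    ngraph::pass::Manager pre_dialect, post_dialect, post_precision;
    // Backend-specific passes may be registered on any of the three stages here.
    const auto schedule = subgraph->generate(output_shapes, input_shapes,
                                             pre_dialect, post_dialect, post_precision,
                                             /*compile_params=*/nullptr);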