Merge branch 'master' into river/device_name_policy

riverlijunjie committed Mar 11, 2024
2 parents f1bd8c1 + 5b7d7c8 commit 78aa44a

Showing 66 changed files with 1,181 additions and 488 deletions.
@@ -81,23 +81,19 @@ In this case, you can load the converted model in OpenVINO representation directly
model = OVModelForCausalLM.from_pretrained(model_id)
By default, inference will run on CPU. To select a different inference device, for example, GPU,
add ``device="GPU"`` to the ``from_pretrained()`` call. To switch to a different device after
the model has been loaded, use the ``.to()`` method. The device naming convention is the same
as in OpenVINO native API:

.. code-block:: python

   model.to("GPU")
The Optimum-Intel API also provides out-of-the-box model optimization through weight compression
using NNCF, which substantially reduces the model footprint and inference latency:

.. code-block:: python

   model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_8bit=True)

   # or if the model was already converted
   model = OVModelForCausalLM.from_pretrained(model_path, load_in_8bit=True)

   # save the model after optimization
   model.save_pretrained(optimized_model_path)
Weight compression is applied by default to models larger than one billion parameters and is
also available in the CLI as the ``--int8`` option.
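
A rough sketch of the equivalent CLI flow (``gpt2`` and the output directory are
placeholders; the exact flag set may differ between Optimum-Intel versions):

.. code-block:: console

   optimum-cli export openvino --model gpt2 --int8 ov_gpt2_int8
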
@@ -121,6 +117,15 @@ compression with the ``OVWeightQuantizationConfig`` class to control weight quantization
       quantization_config=OVWeightQuantizationConfig(bits=4, asym=True, ratio=0.8, dataset="ptb"),
   )

   # or if the model was already converted
   model = OVModelForCausalLM.from_pretrained(
       model_path,
       quantization_config=OVWeightQuantizationConfig(bits=4, asym=True, ratio=0.8, dataset="ptb"),
   )

   # save the model after optimization
   model.save_pretrained(optimized_model_path)
The optimized model can be saved as usual with a call to ``save_pretrained()``.
For more details on compression options, refer to the :doc:`weight compression guide <weight_compression>`.
@@ -168,13 +173,14 @@ an inference pipeline. This setup allows for easy text processing and model interaction.
Converting LLMs to OpenVINO IR on the fly for every run is a resource-intensive task.
It is good practice to convert the model once, save it to a folder, and load it from there for inference.
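
A minimal sketch of that convert-once workflow (``model_id`` and the save
directory are placeholders):

.. code-block:: python

   from optimum.intel import OVModelForCausalLM

   # convert once and save the OpenVINO IR
   model = OVModelForCausalLM.from_pretrained(model_id, export=True)
   model.save_pretrained("ov_model_dir")

   # later, load the already-converted model directly
   model = OVModelForCausalLM.from_pretrained("ov_model_dir")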

By default, inference will run on CPU. To switch to a different device, the ``device`` attribute
from the ``from_pretrained`` function can be used. The device naming convention is the
same as in OpenVINO native API:
By default, inference will run on CPU. To select a different inference device, for example, GPU,
add ``device="GPU"`` to the ``from_pretrained()`` call. To switch to a different device after
the model has been loaded, use the ``.to()`` method. The device naming convention is the same
as in OpenVINO native API:

.. code-block:: python

   model = OVModelForCausalLM.from_pretrained(model_id, export=True, device="GPU")
   model.to("GPU")
Enabling OpenVINO Runtime Optimizations
############################################################
2 changes: 0 additions & 2 deletions src/common/snippets/include/snippets/lowered/loop_manager.hpp
@@ -345,8 +345,6 @@ class LinearIR::LoopManager {
/**
* @brief When the previous expression was replaced with new expressions (decomposition), the method updates the corresponding Loop.
 * If ports of the decomposed expression were Loop ports, these Loop ports may be updated via the parameters `entries` and `exits`.
 * Note: This method should be removed once Softmax decomposition is moved to the data flow pipeline, since
 *       all decompositions should be called in that pipeline.
* @param new_expr_begin the first expression iterator
* @param new_expr_end the next iterator after the last expression
* @param decomposed_expr the expression that is decomposed into several other exprs
@@ -0,0 +1,33 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
* @interface ReduceDecomposition
* @brief Decomposes snippets::Reduce operations to a range of low-level operations on linear IR
 * @attention Only reduction along the last dimension is supported
* @ingroup snippets
*/
class ReduceDecomposition : public RangedPass {
public:
OPENVINO_RTTI("ReduceDecomposition", "RangedPass")
explicit ReduceDecomposition(size_t vector_size);
bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;

private:
size_t m_vector_size = 0;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
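
For orientation, here is a sketch of how such a lowered pass might be registered
and run. The ``PassPipeline`` usage and both header paths are assumptions for
illustration; this diff only shows the pass declaration itself:

.. code-block:: cpp

   #include "snippets/lowered/pass/pass.hpp"                  // assumed location of PassPipeline
   #include "snippets/lowered/pass/reduce_decomposition.hpp"  // assumed path; not shown in this diff

   // Assumed usage sketch: register ReduceDecomposition with the target vector
   // width and run it over a LinearIR. Names follow the snippets conventions
   // visible in this diff, but the pipeline call is an assumption.
   void decompose_reduces(ov::snippets::lowered::LinearIR& linear_ir, size_t vector_size) {
       ov::snippets::lowered::pass::PassPipeline pipeline;
       pipeline.register_pass<ov::snippets::lowered::pass::ReduceDecomposition>(vector_size);
       pipeline.run(linear_ir);
   }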

This file was deleted.

54 changes: 54 additions & 0 deletions src/common/snippets/include/snippets/op/reduce.hpp
@@ -0,0 +1,54 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "snippets/shape_inference/shape_infer_instances.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
* @interface ReduceBase
* @brief Base class for reduce operations.
* @param m_axis reduce axis.
* @ingroup snippets
*/
class ReduceBase : public ov::op::Op {
public:
OPENVINO_OP("ReduceBase", "SnippetsOpset");

ReduceBase(const Output<Node>& x, size_t axis);
ReduceBase() = default;

bool visit_attributes(AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
size_t get_axis() const { return m_axis; }
static void compute_and_set_reduce_subtensors(const std::shared_ptr<ReduceBase>& reduce);

protected:
size_t m_axis = 0;
};

class ReduceSum : public ReduceBase {
public:
OPENVINO_OP("ReduceSum", "SnippetsOpset", ReduceBase);
ReduceSum(const Output<Node>& x, size_t axis) : ReduceBase(x, axis) {}
ReduceSum() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
};

class ReduceMax : public ReduceBase {
public:
OPENVINO_OP("ReduceMax", "SnippetsOpset", ReduceBase);
ReduceMax(const Output<Node>& x, size_t axis) : ReduceBase(x, axis) {}
ReduceMax() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
};

} // namespace op
} // namespace snippets
} // namespace ov
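
A brief construction sketch based on the declarations above (the parameter
shape and axis are placeholders; this is illustrative, not taken from the diff):

.. code-block:: cpp

   #include <memory>

   #include "openvino/op/parameter.hpp"
   #include "snippets/op/reduce.hpp"  // the header declared above

   // Illustrative only: build a snippets ReduceSum over the last axis of a rank-3 input.
   std::shared_ptr<ov::snippets::op::ReduceSum> make_reduce_sum_example() {
       auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 16, 64});
       auto reduce = std::make_shared<ov::snippets::op::ReduceSum>(input, /*axis=*/2);
       reduce->validate_and_infer_types();  // infers the reduced output shape
       return reduce;
   }
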
@@ -4,23 +4,25 @@

#pragma once

#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "openvino/pass/graph_rewrite.hpp"

namespace ov {
namespace snippets {
namespace pass {

/**
* @interface SetSoftmaxPorts
* @brief The pass updates port descriptors in accordance with the Softmax reduction axis
* @interface ReduceToSnippetsReduce
 * @brief Converts ReduceMax and ReduceSum from the OpenVINO opset to the snippets opset.
 * Also checks that the reduction operation is supported by snippets.
* @ingroup snippets
*/
class SetSoftmaxPorts: public ov::pass::MatcherPass {
class ReduceToSnippetsReduce: public ov::pass::MatcherPass {
public:
SetSoftmaxPorts();
ReduceToSnippetsReduce();
};


} // namespace pass
} // namespace snippets
} // namespace ov
} // namespace ov
@@ -0,0 +1,27 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/pass/pattern/matcher.hpp"

namespace ov {
namespace snippets {
namespace pass {

/**
* @interface SoftmaxDecomposition
* @brief Decomposes Softmax to a range of low-level operations
* @ingroup snippets
*/
class SoftmaxDecomposition: public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("SoftmaxDecomposition", "0");
SoftmaxDecomposition();
};

} // namespace pass
} // namespace snippets
} // namespace ov
@@ -68,5 +68,12 @@ class BrgemmShapeInfer : public IShapeInferSnippets {
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

class ReduceShapeInfer : public IShapeInferSnippets {
size_t m_axis;
public:
explicit ReduceShapeInfer(const std::shared_ptr<Node>& n);
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

} // namespace snippets
} // namespace ov
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa.hpp
@@ -26,6 +26,7 @@
#include "op/vector_buffer.hpp"
#include "op/rank_normalization.hpp"
#include "op/perf_count.hpp"
#include "op/reduce.hpp"

namespace ov {
namespace snippets {
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/snippets_isa_tbl.hpp
@@ -23,6 +23,8 @@ OV_OP(BroadcastMove, ov::snippets::op)
OV_OP(Scalar, ov::snippets::op)
OV_OP(Nop, ov::snippets::op)
OV_OP(RankNormalization, ov::snippets::op)
OV_OP(ReduceMax, ov::snippets::op)
OV_OP(ReduceSum, ov::snippets::op)

#ifdef SNIPPETS_DEBUG_CAPS
OV_OP(PerfCountBegin, ov::snippets::op)
1 change: 0 additions & 1 deletion src/common/snippets/include/snippets/target_machine.hpp
@@ -50,7 +50,6 @@ class TargetMachine {
*/
virtual size_t get_lanes() const = 0;


/**
 * @brief called by the generator to get all emitters for a target machine
* @return a map by node's type info with callbacks to create an instance of emitter for corresponding operation type
4 changes: 3 additions & 1 deletion src/common/snippets/src/lowered/linear_ir.cpp
@@ -243,8 +243,10 @@ void LinearIR::debug_print(bool tds_as_pointers) const {

void LinearIR::init_emitters(const std::shared_ptr<TargetMachine>& target) {
for (auto& expr : m_expressions) {
if (!expr->get_emitter())
if (!expr->get_emitter()) {
expr->m_emitter = target->get(expr->get_node()->get_type_info())(expr);
OPENVINO_ASSERT(expr->m_emitter, "Emitter can't be created for the node ", expr->get_node());
}
}
}

2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/pass/assign_registers.cpp
@@ -101,7 +101,7 @@ bool AssignRegisters::run(LinearIR& linear_ir) {
manually_assigned_gprs[expr->get_output_port_connector(0)] =
static_cast<Reg>(num_results + num_parameters + buffer_id);
} else if (ov::is_type<op::HorizonMax>(op) || ov::is_type<op::HorizonSum>(op)) {
// Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer.
// Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer.
// We should manually assign a single vector register to the VectorBuffer and the Max/Sum output to simulate an accumulator
// TODO [96351]: We should rewrite accumulator pattern using another way
const auto& input_tensor = expr->get_input_port_connector(0);
Expand Down