Merge branch 'master' into xp/fix_zoom_regression_to_master
xipingyan authored Aug 16, 2024
2 parents 8af13ec + 0b38dd7 commit 36805ef
Showing 55 changed files with 1,440 additions and 291 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/fedora.yml
@@ -221,6 +221,7 @@ jobs:
if-no-files-found: 'error'

RPM_Packages:
+ name: RPM packages
needs: [Docker, Build]
timeout-minutes: 10
defaults:
@@ -273,6 +274,7 @@ jobs:
python3 -c 'from openvino import Core; Core().get_property("BATCH", "SUPPORTED_PROPERTIES")'
python3 -c 'from openvino.frontend import FrontEndManager; assert len(FrontEndManager().get_available_front_ends()) == 7'
benchmark_app --help
+ opt_in_out --help
ovc --help
Overall_Status:
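
For reference, the Python smoke checks this job runs can be collected into a standalone script. A minimal sketch, assuming the OpenVINO RPM packages built by this workflow are installed on the system:

# Standalone version of the smoke checks run by the RPM_Packages job
# (assumes the OpenVINO RPMs built by this workflow are installed).
from openvino import Core
from openvino.frontend import FrontEndManager

# The BATCH device must report its supported properties.
print(Core().get_property("BATCH", "SUPPORTED_PROPERTIES"))

# All seven frontends are expected to be discoverable.
assert len(FrontEndManager().get_available_front_ends()) == 7
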
8 changes: 1 addition & 7 deletions .github/workflows/job_build_linux.yml
@@ -113,13 +113,7 @@ jobs:
python3 -m pip install -r ${OPENVINO_REPO}/src/bindings/python/wheel/requirements-dev.txt
# For running ONNX frontend unit tests
- if [[ ${{ inputs.os }} == 'ubuntu_24_04' ]]; then
-   # Should be removed after https://github.com/openvinotoolkit/openvino/pull/24242 is merged
-   export CMAKE_GENERATOR="Unix Makefiles"
-   python3 -m pip install --force-reinstall --no-cache-dir -r ${OPENVINO_REPO}/src/frontends/onnx/tests/requirements.txt
- else
-   python3 -m pip install --force-reinstall -r ${OPENVINO_REPO}/src/frontends/onnx/tests/requirements.txt
- fi
+ python3 -m pip install -r ${OPENVINO_REPO}/src/frontends/onnx/tests/requirements.txt
# For running TensorFlow frontend unit tests
python3 -m pip install -r ${OPENVINO_REPO}/src/frontends/tensorflow/tests/requirements.txt
3 changes: 2 additions & 1 deletion .github/workflows/linux_conditional_compilation.yml
@@ -102,7 +102,7 @@ jobs:
BUILD_DIR: /__w/openvino/openvino/openvino_build
SELECTIVE_BUILD_STAT_DIR: /__w/openvino/openvino/selective_build_stat
MODELS_PATH: /__w/openvino/openvino/testdata
- SCCACHE_AZURE_KEY_PREFIX: ubuntu22_x86_64_itt_clang_Release
+ SCCACHE_AZURE_KEY_PREFIX: ubuntu22_x86_64_itt_clang_Release_faster_build
if: ${{ !needs.smart_ci.outputs.skip_workflow && github.event_name != 'merge_group' }}

steps:
@@ -157,6 +157,7 @@
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DENABLE_PROFILING_ITT=ON \
-DSELECTIVE_BUILD=COLLECT \
+ -DENABLE_FASTER_BUILD=ON \
-DENABLE_DEBUG_CAPS=ON \
-DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \
-DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \
8 changes: 6 additions & 2 deletions cmake/developer_package/packaging/archive.cmake
@@ -7,10 +7,14 @@ include(GNUInstallDirs)
if(APPLE)
# on macOS versions with SIP enabled, we need to use @rpath
# because DYLD_LIBRARY_PATH is ignored
-   set(CMAKE_SKIP_INSTALL_RPATH OFF)
+   set(CMAKE_SKIP_INSTALL_RPATH_DEFAULT OFF)
else()
# we don't need RPATHs, because setupvars.sh is used
-   set(CMAKE_SKIP_INSTALL_RPATH ON)
+   set(CMAKE_SKIP_INSTALL_RPATH_DEFAULT ON)
endif()

+ if(NOT DEFINED CMAKE_SKIP_INSTALL_RPATH)
+   set(CMAKE_SKIP_INSTALL_RPATH ${CMAKE_SKIP_INSTALL_RPATH_DEFAULT})
+ endif()

#
3 changes: 0 additions & 3 deletions cmake/features.cmake
@@ -194,9 +194,6 @@ ov_dependent_option (ENABLE_SYSTEM_PROTOBUF "Enables use of system Protobuf" OFF
ov_dependent_option (ENABLE_SYSTEM_SNAPPY "Enables use of system version of Snappy" OFF
"ENABLE_SNAPPY_COMPRESSION" OFF)

- ov_dependent_option (ENABLE_PYTHON_PACKAGING "Enables packaging of Python API in APT / YUM" OFF
-                      "ENABLE_PYTHON;UNIX" OFF)
-
ov_dependent_option(ENABLE_JS "Enables JS API building" ${ENABLE_JS_DEFAULT} "NOT ANDROID;NOT EMSCRIPTEN" OFF)

ov_option(ENABLE_OPENVINO_DEBUG "Enable output for OPENVINO_DEBUG statements" OFF)
11 changes: 6 additions & 5 deletions cmake/packaging/rpm.cmake
@@ -274,9 +274,6 @@ macro(ov_cpack_settings)
ov_rpm_generate_conflicts("${OV_CPACK_COMP_CORE_DEV}" ${conflicting_versions})

ov_rpm_add_rpmlint_suppression("${OV_CPACK_COMP_CORE_DEV}"
-     # contains samples source codes
-     "devel-file-in-non-devel-package /usr/${OV_CPACK_INCLUDEDIR}/ngraph"
-     "devel-file-in-non-devel-package /usr/${OV_CPACK_INCLUDEDIR}/ie"
"devel-file-in-non-devel-package /usr/${OV_CPACK_INCLUDEDIR}/openvino"
"devel-file-in-non-devel-package /usr/${OV_CPACK_RUNTIMEDIR}/libopenvino*"
"devel-file-in-non-devel-package /usr/${OV_CPACK_RUNTIMEDIR}/pkgconfig/openvino.pc")
@@ -302,8 +299,12 @@ macro(ov_cpack_settings)
ov_rpm_generate_conflicts(${python_component} ${conflicting_versions})

  ov_rpm_add_rpmlint_suppression("${python_component}"
+     # entry points
+     "no-manual-page-for-binary benchmark_app"
+     "no-manual-page-for-binary opt_in_out"
+     "no-manual-page-for-binary ovc"
+     # all directories
      "non-standard-dir-perm /usr/lib64/${pyversion}/site-packages/openvino/*"
+     "non-standard-dir-perm /usr/lib/${pyversion}/site-packages/openvino/*"
  )
endif()

@@ -383,7 +384,7 @@ macro(ov_cpack_settings)
set(CPACK_COMPONENT_OPENVINO_DESCRIPTION "Intel(R) Distribution of OpenVINO(TM) Toolkit Libraries and Development files")
set(CPACK_RPM_OPENVINO_PACKAGE_REQUIRES "${libraries_dev_package}, ${samples_package}")
if(ENABLE_PYTHON_PACKAGING)
-   set(CPACK_DEBIAN_OPENVINO_PACKAGE_DEPENDS "${CPACK_RPM_OPENVINO_PACKAGE_REQUIRES}, ${python_package}, ${python_samples_package}")
+   set(CPACK_RPM_OPENVINO_PACKAGE_REQUIRES "${CPACK_RPM_OPENVINO_PACKAGE_REQUIRES}, ${python_package}, ${python_samples_package}")
endif()
set(CPACK_RPM_OPENVINO_PACKAGE_NAME "openvino-${cpack_name_ver}")
set(CPACK_RPM_OPENVINO_PACKAGE_ARCHITECTURE "noarch")
5 changes: 4 additions & 1 deletion src/bindings/python/CMakeLists.txt
@@ -116,6 +116,9 @@ ov_check_init_files_alignment("${INIT_FILES_RUNTIME}")

ov_option(ENABLE_PYTHON "Enables OpenVINO Python API build" ${ENABLE_PYTHON_DEFAULT})

+ ov_dependent_option (ENABLE_PYTHON_PACKAGING "Enables packaging of Python API in APT / YUM" OFF
+                      "ENABLE_PYTHON;LINUX" OFF)

#
# Check for wheel package
#
@@ -366,7 +369,7 @@ if(ENABLE_PYTHON_PACKAGING)
ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_OPENVINO}_package_${pyversion} HIDDEN)

install(DIRECTORY ${ov_python_package_prefix}/ ${telemetry_python_package_prefix}/
-         DESTINATION ${CMAKE_INSTALL_PREFIX}
+         DESTINATION .
COMPONENT ${OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE}_${pyversion}
${OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL}
USE_SOURCE_PERMISSIONS)
14 changes: 14 additions & 0 deletions src/bindings/python/src/pyopenvino/graph/symbol.cpp
@@ -25,6 +25,20 @@ void regclass_graph_Symbol(py::module m) {
},
py::is_operator());

+ symbol.def(
+     "__add__",
+     [](const std::shared_ptr<ov::Symbol>& a, const std::shared_ptr<ov::Symbol>& b) {
+         return a + b;
+     },
+     py::is_operator());
+
+ symbol.def(
+     "__sub__",
+     [](const std::shared_ptr<ov::Symbol>& a, const std::shared_ptr<ov::Symbol>& b) {
+         return a - b;
+     },
+     py::is_operator());
+
symbol.def(
"__bool__",
[](const std::shared_ptr<ov::Symbol>& self) -> bool {
9 changes: 9 additions & 0 deletions src/bindings/python/tests/test_runtime/test_dimension.py
@@ -80,6 +80,15 @@ def test_symbol():
assert dimension.get_symbol() == new_dimension.get_symbol(), "Check: Two symbols are equal: Symbol.__eq__"


+ def test_symbol_operators():
+     symbol_a, symbol_b = Symbol(), Symbol()
+     assert symbol_a + symbol_b == symbol_b + symbol_a
+
+     symbol_c, symbol_d = Symbol(), Symbol()
+     assert symbol_c + symbol_d - symbol_d == symbol_c
+     assert symbol_c + symbol_d - symbol_c == symbol_d


def test_symbol_hash():
symbol = Symbol()
assert isinstance(hash(symbol), int)
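
Taken together, the new operators compose as in the following usage sketch. It assumes Symbol is imported as in this test module and relies only on the behaviors exercised by the tests above (commutativity and add/sub inversion):

# Usage sketch for the new Symbol arithmetic; Symbol is assumed to be
# imported as in this test module.
m, n = Symbol(), Symbol()

flattened = m + n           # symbol representing a flattened (m + n) dimension
assert flattened == n + m   # order of addition does not matter

remainder = flattened - n   # splitting the flattened dimension back out
assert remainder == m       # recovers the original symbol
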
12 changes: 11 additions & 1 deletion src/common/snippets/include/snippets/lowered/pass/init_loops.hpp
@@ -24,7 +24,17 @@ class InitLoops : public Pass {
InitLoops() = default;
bool run(LinearIR& linear_ir) override;

-     static void init_loop_info(const UnifiedLoopInfoPtr& loop_info, size_t loop_id, bool only_runtime_args = false);
+     /**
+      * @brief Updates ptr_increments and finalization offsets of the provided "loop_info" based on the current work amount
+      */
+     static void update_data_pointer_shifts(const UnifiedLoopInfoPtr& loop_info);
+     /**
+      * @brief Updates the work amount and the data pointer shifts of the provided "loop_info"
+      */
+     static void update_runtime_parameters(const UnifiedLoopInfoPtr& loop_info);
+
+ private:
+     static void update_compile_parameters(const UnifiedLoopInfoPtr& loop_info, size_t loop_id);
};

} // namespace pass
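
A schematic Python model of how the two public helpers relate, following the comments above. All names, fields, and the offset formula are illustrative placeholders, not the C++ UnifiedLoopInfo API:

# Schematic model of the InitLoops helpers documented above; everything here
# is an illustrative placeholder for the C++ UnifiedLoopInfo structures.
class LoopInfoModel:
    def __init__(self, work_amount, element_sizes):
        self.work_amount = work_amount
        self.element_sizes = element_sizes
        self.ptr_increments = []
        self.finalization_offsets = []

def update_data_pointer_shifts(loop_info):
    # Pointer increments and finalization offsets are recomputed from the
    # current work amount (the formula below is assumed for illustration).
    loop_info.ptr_increments = list(loop_info.element_sizes)
    loop_info.finalization_offsets = [
        -inc * loop_info.work_amount for inc in loop_info.ptr_increments
    ]

def update_runtime_parameters(loop_info, new_work_amount):
    # The runtime update first refreshes the work amount, then recomputes
    # the dependent data pointer shifts, as the header comments describe.
    loop_info.work_amount = new_work_amount
    update_data_pointer_shifts(loop_info)
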
40 changes: 37 additions & 3 deletions src/common/snippets/include/snippets/pass/split_dimension_m.hpp
@@ -14,6 +14,9 @@ namespace pass {
* @interface SplitDimensionM
* @brief Inserts Reshape nodes before inputs and after outputs of Subgraphs with MatMul inside
* to split dimension M for MatMuls. It allows to increase work amount for parallelism
+  * @attention This pass works only for MHA with static shapes.
+  *            For dynamic shapes, parallel work amount is optimized in RuntimeConfigurator.
+  * @todo Ticket 148805: Move static-case handling into RuntimeConfigurator as well.
* @ingroup snippets
*/
class SplitDimensionM: public CommonOptimizations::SubgraphPass {
@@ -28,17 +31,48 @@ class SplitDimensionM: public CommonOptimizations::SubgraphPass {
// Returns True if parallelism work amount (concurrency) can be increased by this optimization
static bool can_be_optimized(const std::shared_ptr<const ov::Node>& node, size_t concurrency);

+     /**
+      * @brief Tries to split the M dimension in "shape" in accordance with the optimal parallel work amount
+      * @param shape Original shape
+      * @param optimal_parallelism_work_amount Optimal work amount
+      * @param batch_m_dim reference to the batch part of the split M
+      * @param new_m_dim reference to the new M dim after the split
+      * @return true if the split was successful, otherwise false
+      */
+     static bool split(const ov::Shape& shape, size_t optimal_parallelism_work_amount, size_t& batch_m_dim, size_t& new_m_dim);

+     /**
+      * @brief Splits the M dimension in the given order
+      * @param order Original order
+      * @param m_index M dimension index
+      * @return updated order with the split M dimension
+      */
+     static std::vector<size_t> get_updated_order(const std::vector<size_t>& order, size_t m_index);
+     /**
+      * @brief Reshapes the M dimension in "shape": separates M into two parts, "batch_m_dim" and "new_m_dim"
+      * @param shape Shape to split
+      * @param m_index M dimension index
+      * @param batch_m_dim batch part of the split M
+      * @param new_m_dim new M dim after the split
+      * @return the updated shape
+      */
+     static ov::snippets::VectorDims reshape_m_dim(ov::snippets::VectorDims shape, size_t m_index, size_t batch_m_dim, size_t new_m_dim);
+     /**
+      * @brief Unsqueezes the M dimension in "shape" (inserts "1" before the dimension)
+      * @param shape Shape to split
+      * @param m_index M dimension index
+      * @return the updated shape
+      */
+     static ov::snippets::VectorDims unsqueeze_m_dim(ov::snippets::VectorDims shape, size_t m_index);

private:
static std::shared_ptr<ov::op::v0::MatMul> get_matmul(const std::shared_ptr<op::Subgraph>& subgraph);
static std::pair<size_t, size_t> get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
-     static bool split(const ov::Shape& shape, size_t optimal_parallelism_work_amount, size_t& batch_m_dim, size_t& new_m_dim);

void reshape_subgraph(const std::shared_ptr<op::Subgraph>& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim);

size_t m_concurrency;
};


} // namespace pass
} // namespace snippets
} // namespace ov
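
To make the split contract concrete, here is an illustrative Python sketch of factoring M into batch_m_dim * new_m_dim against a target parallel work amount. The heuristic is assumed for illustration only; it is not the actual SplitDimensionM implementation:

import math

def split_m(shape, optimal_parallelism_work_amount):
    # Factor M (shape[-2]) into (batch_m_dim, new_m_dim) so that the collapsed
    # batch multiplied by batch_m_dim covers the target work amount.
    # Illustrative heuristic only, not the one used by SplitDimensionM.
    batch = math.prod(shape[:-2])
    m = shape[-2]
    target = math.ceil(optimal_parallelism_work_amount / max(batch, 1))
    for batch_m_dim in range(min(target, m), 0, -1):
        if m % batch_m_dim == 0:
            return batch_m_dim, m // batch_m_dim
    return None  # no valid split found

# Example: batch = 2, M = 64, target concurrency = 16 -> split M as 8 x 8.
print(split_m((2, 64, 128), 16))  # (8, 8)
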
86 changes: 82 additions & 4 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
@@ -5,6 +5,7 @@
#pragma once

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_info.hpp"
#include "snippets/kernel_executor_table.hpp"
#include "snippets/lowered/pass/pass.hpp"

@@ -82,8 +83,14 @@ class RuntimeConfigurator {
/**
* @brief Update RuntimeConfig based on LinearIR
* @param linear_ir LinearIR
+  * @todo Ticket 148891: Rewrite on PassPipeline
*/
virtual void update(const lowered::LinearIRCPtr& linear_ir);
+ /**
+  * @brief Update tensor rank based on master shape
+  * @param master_shape Master shape
+  */
+ virtual void update_tensor_rank(const ov::snippets::VectorDims& master_shape);
/**
* @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator
* @param linear_ir LinearIR
@@ -109,11 +116,21 @@
* @param linear_ir LinearIR
*/
virtual void init_tensor_rank(const lowered::LinearIRCPtr& linear_ir) const;

+ struct UnifiedLoopInfoRtParams {
+     size_t work_amount = 0;
+     std::vector<int64_t> ptr_increments;
+     std::vector<int64_t> finalization_offsets;
+ };
+ static UnifiedLoopInfoRtParams compute_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info);
+ using LoopInfoRuntimeParamsMap = std::unordered_map<lowered::UnifiedLoopInfoPtr, UnifiedLoopInfoRtParams>;
/**
* @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo
* @param linear_ir LinearIR
+  * @param initializated_info_map Reference to a map [LoopInfo -> RuntimeParams].
+  *        Can be used to pass loop infos into the method that were already initialized, e.g. by parallel domain optimization
*/
void update_loop_info(const lowered::LinearIRCPtr& linear_ir) const;
+ void update_loop_info(const lowered::LinearIRCPtr& linear_ir, LoopInfoRuntimeParamsMap& initializated_info_map) const;
/**
* @brief Update Buffer scratchpad size and offsets if needed
* Note: `update_loop_info` must be called before
@@ -122,12 +139,73 @@
void update_buffer_scratchpad_size(const lowered::LinearIRCPtr& linear_ir) const;
/**
* @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig
+  * @param shapes shapes used in offsets computation
+  * @param layouts layouts used in offsets computation
   */
- void update_data_offsets() const;
+ void update_data_offsets(const std::vector<ov::snippets::VectorDims>& shapes,
+                          const std::vector<std::vector<size_t>>& layouts) const;
  /**
-  * @brief Update latest input shapes
+  * @brief Extract shapes from m_io_descs
   */
- void update_latest_shapes();
+ std::vector<ov::snippets::VectorDims> extract_shapes() const;
+ /**
+  * @brief Extract layouts from m_io_descs
+  */
+ std::vector<std::vector<size_t>> extract_layouts() const;

+ class ParallelWAOptimizer {
+ public:
+     /**
+      * @brief Inits ParallelWAOptimizer: computes optimizer parameters which should be set at compilation stage
+      * @param linear_ir LinearIR
+      * @param io_descs Input/output descriptors which are used for optimizer parameters initialization
+      * @param in_num Number of inputs. It is needed to distinguish input and output shapes/layouts
+      */
+     void init(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
+               const std::vector<snippets::lowered::PortDescriptorPtr>& io_descs,
+               size_t in_num);
+     /**
+      * @brief Checks if the optimizer is enabled
+      * @todo Ticket 148891: when RuntimeConfigurator::update is rewritten on PassPipeline, this method should be removed:
+      *       we will simply not register ParallelWAOptimizer when it is not needed
+      */
+     bool enabled();
+     /**
+      * @brief Checks if the current master shape can be optimized, and if yes, updates all the necessary runtime information
+      * @param master_shape Master shape
+      * @param map Loop info -> Runtime params map which will be passed to "update_loop_info";
+      *        the map is filled with updated loops_to_split loops: "new_m" work amount is set for them, and runtime params are updated correspondingly
+      * @param shapes Vector which is filled with the split shapes
+      * @param layouts Vector which is filled with the split layouts
+      * @param in_num Number of inputs. It is needed to distinguish input and output shapes/layouts
+      * @return status if the optimization is applied
+      */
+     void optimize(ov::snippets::VectorDims& master_shape,
+                   ov::snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap& map,
+                   std::vector<ov::snippets::VectorDims>& shapes,
+                   std::vector<std::vector<size_t>>& layouts,
+                   size_t in_num);
+
+ private:
+     void update_master_shape(ov::snippets::VectorDims& master_shape, size_t new_batch_dim, size_t new_kernel_dim);
+     void update_split_loops_info(ov::snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap& map, size_t new_kernel_dim);
+     void update_shapes(std::vector<ov::snippets::VectorDims>& shapes, size_t new_batch_dim, size_t new_kernel_dim);
+     void update_layouts(std::vector<std::vector<size_t>>& layouts);
+
+     static std::unordered_set<snippets::lowered::ExpressionPtr> find_applicable_brgemms(const ov::snippets::lowered::LinearIRCPtr& linear_ir);
+     static std::unordered_set<size_t> find_unsqueezed_params(
+         const ov::snippets::lowered::LinearIRCPtr& linear_ir,
+         const std::unordered_set<snippets::lowered::ExpressionPtr>& brgemms);
+     static std::unordered_set<ov::snippets::lowered::UnifiedLoopInfoPtr> find_loops_to_split(
+         const ov::snippets::lowered::LinearIRCPtr& linear_ir,
+         const std::unordered_set<size_t>& unsqueezed_params);
+
+     std::unordered_set<ov::snippets::lowered::UnifiedLoopInfoPtr> loops_to_split{};
+     std::unordered_set<size_t> unsqueezed_params{};
+     std::vector<std::vector<size_t>> optimized_layouts{};
+     std::vector<size_t> m_dim_idces{};
+     size_t concurrency = 0;
+ } m_optimizer;

std::shared_ptr<RuntimeConfig> m_config = nullptr;

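
The optimize flow described above reduces to a simple decision sequence. A heavily simplified Python sketch follows; the structure and names are illustrative, and split_m refers to the assumed heuristic sketched in the SplitDimensionM section:

import math

def optimize(master_shape, concurrency, split_m):
    # Illustrative flow of ParallelWAOptimizer::optimize: if the parallel work
    # amount implied by the master shape is below the available concurrency,
    # try to split M. The real logic updates LinearIR loop info, shapes and
    # layouts rather than returning the dims.
    parallel_work_amount = math.prod(master_shape[:-1])
    if parallel_work_amount >= concurrency:
        return None  # enough parallelism already; optimization not applied
    result = split_m(master_shape, concurrency)
    if result is None:
        return None  # M cannot be factored; leave runtime info untouched
    new_batch_dim, new_kernel_dim = result
    # Here the real optimizer would call update_split_loops_info /
    # update_shapes / update_layouts / update_master_shape with these values.
    return new_batch_dim, new_kernel_dim
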
13 changes: 13 additions & 0 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -275,6 +275,19 @@ std::shared_ptr<ov::Node> get_leaf_node_of_first_parent_shape_infer_seq(const st

int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1);

+ /**
+  * @brief Traverses the path starting from "expr" and calls "func" for each expression.
+  *        Traversal direction is defined by "visit_parent_path"
+  * @param expr The expression from which the path is started.
+  * @param visited Set of expressions which were already visited.
+  * @param func The function which is called for each visited expression.
+  * @param visit_parent_path if true, parent nodes are visited; otherwise, consumers are visited.
+  */
+ void visit_path(const lowered::ExpressionPtr& expr,
+                 std::unordered_set<lowered::ExpressionPtr>& visited,
+                 std::function<void(lowered::ExpressionPtr)> func,
+                 bool visit_parent_path);

} // namespace utils
} // namespace snippets
} // namespace ov
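
A minimal Python analogue of the traversal contract; the parents/consumers attributes are placeholders for the real LinearIR expression API:

def visit_path(expr, visited, func, visit_parent_path):
    # Iterative walk mirroring the documented contract: call func once per
    # not-yet-visited expression, moving toward parents or consumers.
    # "parents"/"consumers" are placeholder attributes, not the real API.
    stack = [expr]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        func(current)
        stack.extend(current.parents if visit_parent_path else current.consumers)
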
