Merge branch 'itikhono/refactoring/pass_manager' of https://github.com/itikhono/openvino into itikhono/refactoring/pass_manager
itikhono committed Jul 25, 2024
2 parents 2a8e9bf + 184dcb7 commit 3aa13ea
Showing 9 changed files with 39 additions and 40 deletions.
4 changes: 2 additions & 2 deletions docs/articles_en/assets/snippets/ov_caching.cpp
@@ -1,10 +1,10 @@
#include <openvino/runtime/core.hpp>

//! [ov:caching:part0]
void part0() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
std::string device = "GPU"; // For example: "CPU", "GPU", "NPU".
ov::AnyMap config;
//! [ov:caching:part0]
ov::Core core; // Step 1: create ov::Core object
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
auto model = core.read_model(modelPath); // Step 2: Read Model
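// The diff is truncated here. As a sketch (not the exact file contents), the caching flow
// typically finishes by compiling the model, which creates the cached blob on the first run
// and imports it on later runs:
auto compiled = core.compile_model(model, device, config); // Step 3: compile; reuses the cache on subsequent runs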
1 change: 1 addition & 0 deletions docs/articles_en/assets/snippets/ov_caching.py
@@ -8,6 +8,7 @@

import openvino.properties as props

# For example: "CPU", "GPU", "NPU".
device_name = 'CPU'
model_path = get_path_to_model()
path_to_cache_dir = get_temp_dir()
1 change: 1 addition & 0 deletions docs/articles_en/openvino-workflow/model-preparation.rst
@@ -267,6 +267,7 @@ Before saving the model to OpenVINO IR, consider
:doc:`Post-training Optimization <model-optimization-guide/quantizing-models-post-training>` to achieve more efficient inference and
a smaller model.

.. _convert_model_cli_ovc:

Convert a Model in CLI: ``ovc``
###############################
@@ -10,7 +10,6 @@ Optimize Preprocessing

optimize-preprocessing/preprocessing-api-details
optimize-preprocessing/layout-api-overview
optimize-preprocessing/integrate-save-preprocessing-use-case
Torchvision preprocessing converter <optimize-preprocessing/torchvision-preprocessing-converter>

.. meta::
@@ -3,6 +3,11 @@
Preprocessing API - details
===========================

.. toctree::
:maxdepth: 1
:hidden:

preprocessing-api-details/integrate-save-preprocessing-use-case

.. meta::
:description: Learn the details on capabilities of pre-processing API and post-processing.
@@ -10,8 +10,8 @@ Use Case - Integrate and Save Preprocessing Steps Into IR
OpenVINO Intermediate Representation.


Previous sections covered the topic of the :doc:`preprocessing steps <preprocessing-api-details>`
and the overview of :doc:`Layout <layout-api-overview>` API.
Previous sections covered the :doc:`preprocessing steps <../preprocessing-api-details>`
and the overview of :doc:`Layout API <../layout-api-overview>`.

For many applications, it is also important to minimize read/load time of a model.
Therefore, performing integration of preprocessing steps every time on application
@@ -20,25 +20,18 @@ once pre and postprocessing steps have been added, it can be useful to store new
model to OpenVINO Intermediate Representation (OpenVINO IR, `.xml` format).

Most available preprocessing steps can also be performed via command-line options,
using Model Optimizer. For details on such command-line options, refer to the
:doc:`Optimizing Preprocessing Computation <../../../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation>`.
using ``ovc``. For details on such command-line options, refer to the
:ref:`Model Conversion <convert_model_cli_ovc>`.

Code example - Saving Model with Preprocessing to OpenVINO IR
#############################################################

When some preprocessing steps cannot be integrated into the execution graph using
Model Optimizer command-line options (for example, ``YUV``->``RGB`` color space conversion,
``Resize``, etc.), it is possible to write a simple code which:
In the following example:

* Reads the original model (OpenVINO IR, TensorFlow, TensorFlow Lite, ONNX, PaddlePaddle).
* Adds the preprocessing/postprocessing steps.
* Saves resulting model as IR (``.xml`` and ``.bin``).
* The original ONNX model takes one ``float32`` input with the ``{1, 3, 224, 224}`` shape, the ``RGB`` channel order, and mean/scale values applied.
* The application provides a ``BGR`` image buffer with a non-fixed size and feeds input images in batches of two.

Consider the example, where an original ONNX model takes one ``float32`` input with the
``{1, 3, 224, 224}`` shape, the ``RGB`` channel order, and mean/scale values applied.
In contrast, the application provides ``BGR`` image buffer with a non-fixed size and
input images as batches of two. Below is the model conversion code that can be applied
in the model preparation script for such a case.
Below is the model conversion code that can be applied in the model preparation script for this case:
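
For orientation, here is a condensed single-file sketch of these steps (the file names and mean/scale values are illustrative assumptions, not part of the documented snippets in the tabs below):

.. code-block:: cpp

   #include <openvino/core/preprocess/pre_post_process.hpp>
   #include <openvino/pass/serialize.hpp>
   #include <openvino/runtime/core.hpp>

   int main() {
       ov::Core core;
       auto model = core.read_model("model.onnx");  // model expects f32, {1, 3, 224, 224}, RGB, mean/scale applied

       ov::preprocess::PrePostProcessor ppp(model);
       // What the application actually provides: u8 BGR data, NHWC layout, non-fixed spatial size.
       ppp.input().tensor()
           .set_element_type(ov::element::u8)
           .set_color_format(ov::preprocess::ColorFormat::BGR)
           .set_spatial_dynamic_shape()
           .set_layout("NHWC");
       // Steps needed to reach what the model expects.
       ppp.input().preprocess()
           .convert_element_type(ov::element::f32)
           .convert_color(ov::preprocess::ColorFormat::RGB)
           .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR)
           .mean({123.675f, 116.28f, 103.53f})   // illustrative values
           .scale({58.395f, 57.12f, 57.375f});   // illustrative values
       ppp.input().model().set_layout("NCHW");
       model = ppp.build();

       ov::set_batch(model, 2);  // the application feeds batches of two

       // Save the resulting model to OpenVINO IR (.xml + .bin).
       ov::pass::Serialize("model_with_preproc.xml", "model_with_preproc.bin").run_on_model(model);
       return 0;
   }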

* Includes / Imports

@@ -62,7 +55,6 @@

* Preprocessing & Saving to the OpenVINO IR code.


.. tab-set::

.. tab-item:: Python
@@ -83,8 +75,8 @@ in the model preparation script for such a case.
Application Code - Load Model to Target Device
##############################################

After this, the application code can load a saved file and stop preprocessing. In this case, enable
:doc:`model caching <../optimizing-latency/model-caching-overview>` to minimize load
Next, the application code can load a saved file and stop preprocessing. In this case, enable
:doc:`model caching <../../optimizing-latency/model-caching-overview>` to minimize load
time when the cached model is available.


@@ -108,10 +100,10 @@ time when the cached model is available.
Additional Resources
####################

* :doc:`Preprocessing Details <preprocessing-api-details>`
* :doc:`Layout API overview <layout-api-overview>`
* :doc:`Model Optimizer - Optimize Preprocessing Computation <../../../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation>`
* :doc:`Model Caching Overview <../optimizing-latency/model-caching-overview>`
* :doc:`Preprocessing Details <../preprocessing-api-details>`
* :doc:`Layout API overview <../layout-api-overview>`
* :doc:`Model Caching Overview <../../optimizing-latency/model-caching-overview>`
* :doc:`Model Preparation <../../../../model-preparation>`
* The `ov::preprocess::PrePostProcessor <https://docs.openvino.ai/2024/api/c_cpp_api/classov_1_1preprocess_1_1_pre_post_processor.html>`__ C++ class documentation
* The `ov::pass::Serialize <https://docs.openvino.ai/2024/api/c_cpp_api/classov_1_1pass_1_1_serialize.html>`__ - pass to serialize model to XML/BIN
* The ``ov::set_batch`` - update batch dimension for a given model
@@ -61,7 +61,8 @@ To enable model caching, the application must specify a folder to store the cach


With this code, if the device specified by ``device_name`` supports import/export model capability,
a cached blob is automatically created inside the ``/path/to/cache/dir`` folder.
a cached blob (a ``.cl_cache`` file for GPU or a ``.blob`` file for CPU) is automatically
created inside the ``/path/to/cache/dir`` folder.
If the device does not support the import/export capability, cache is not created and no error is thrown.
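
As a sketch of how an application could check for this capability explicitly (assuming the standard ``ov::device::capabilities`` property; this is not part of the snippet above):

.. code-block:: cpp

   #include <algorithm>
   #include <string>
   #include <vector>
   #include <openvino/runtime/core.hpp>

   // Returns true if the device advertises the EXPORT_IMPORT capability,
   // i.e. compiled models can be cached for it.
   bool supports_model_caching(ov::Core& core, const std::string& device_name) {
       std::vector<std::string> caps = core.get_property(device_name, ov::device::capabilities);
       return std::find(caps.begin(), caps.end(), ov::device::capability::EXPORT_IMPORT) != caps.end();
   }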

Note that the first ``compile_model`` operation takes slightly longer, as the cache needs to be created -
@@ -705,23 +705,18 @@ void jit_gelu_tanh_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, con
h->ld1r(vmm_aux1.s, table_val2("gelu_tanh_sqrt_two_over_pi"));
h->fmul(vmm_aux0.s, vmm_aux1.s, vmm_aux2.s);

const bool store_src = vmm_src.getIdx() == vmm_dst.getIdx();
if (store_src) {
h->mov(vmm_aux2.b16, vmm_src.b16);
}

tanh_emitter->emit_code(
{ vmm_aux0.getIdx() },
{ vmm_aux0.getIdx() },
{ vmm_aux2.getIdx() },
aux_vec_idxs,
aux_gpr_idxs);

// compute 0.5 * x * (1 + tanh(G(x)))
h->ld1r(vmm_aux1.s, table_val2("one"));
h->fadd(vmm_aux0.s, vmm_aux1.s, vmm_aux0.s);
h->fadd(vmm_aux0.s, vmm_aux1.s, vmm_aux2.s);
h->ld1r(vmm_aux1.s, table_val2("half"));
h->fmul(vmm_aux0.s, vmm_aux1.s, vmm_aux0.s);
h->fmul(vmm_dst.s, store_src ? vmm_aux2.s : vmm_src.s, vmm_aux0.s);
h->fmul(vmm_dst.s, vmm_src.s, vmm_aux0.s);
}
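
// Editorial sketch (not part of this change): the scalar form of the approximation this
// emitter vectorizes, using the standard tanh-based GELU constants. Assumes <cmath> for std::tanh.
//   gelu_tanh(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
static float gelu_tanh_ref(float x) {
    const float sqrt_two_over_pi = 0.7978845608f;
    const float g = sqrt_two_over_pi * (x + 0.044715f * x * x * x);
    return 0.5f * x * (1.0f + std::tanh(g));
}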

void jit_gelu_tanh_emitter::register_table_entries() {
@@ -1219,6 +1214,8 @@ jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,

size_t jit_mod_emitter::get_inputs_count() const { return 2; }

size_t jit_mod_emitter::get_aux_vecs_count() const { return 1; }

void jit_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) {
emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
@@ -1233,14 +1230,15 @@ void jit_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std

using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;

TReg divend = TReg(in_vec_idxs[0]);
TReg dividend = TReg(in_vec_idxs[0]);
TReg divisor = TReg(in_vec_idxs[1]);
TReg r = TReg(out_vec_idxs[0]);
TReg aux = TReg(aux_vec_idxs[0]);

h->uni_fdiv(r.s, divend.s, divisor.s);
h->frintz(r.s, r.s);
h->uni_fmul(r.s, r.s, divisor.s);
h->uni_fsub(r.s, divend.s, r.s);
h->fdiv(aux.s, dividend.s, divisor.s);
h->frintz(aux.s, aux.s);
h->fmul(aux.s, aux.s, divisor.s);
h->fsub(r.s, dividend.s, aux.s);
}
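
// Editorial sketch (not part of this change): a scalar reference of the sequence emitted above,
// i.e. the truncated-division remainder computed through the aux register so the result register
// may alias an input. Assumes <cmath> for std::trunc.
static float mod_ref(float dividend, float divisor) {
    float q = std::trunc(dividend / divisor);  // fdiv + frintz
    return dividend - q * divisor;             // fmul + fsub
}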

std::set<std::vector<element::Type>> jit_mod_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
@@ -1874,7 +1872,7 @@ void jit_tanh_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const st
TReg src = TReg(in_vec_idxs[0]);
TReg dst = TReg(out_vec_idxs[0]);

TReg aux = TReg(aux_vec_idxs.back());
TReg aux = TReg(aux_vec_idxs[0]);

h->ld1r(aux.s, table_val2("two"));
h->uni_fmul(aux.s, src.s, aux.s);
@@ -477,6 +477,8 @@ class jit_mod_emitter : public jit_emitter {

size_t get_inputs_count() const override;

size_t get_aux_vecs_count() const override;

static std::set<std::vector<element::Type>> get_supported_precisions(const std::shared_ptr<ov::Node>& node = nullptr);

private:
