Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into tj/reference/multiclass_nms/vector-initialization-bug
Browse files Browse the repository at this point in the history
  • Loading branch information
t-jankowski committed Oct 23, 2024
2 parents 3253d38 + a852c4a commit 313f794
Show file tree
Hide file tree
Showing 22 changed files with 178 additions and 88 deletions.
14 changes: 12 additions & 2 deletions .github/workflows/workflow_rerunner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@ name: Rerun Workflow with Known Errors
on:
workflow_run:
workflows:
- Linux (Ubuntu 20.04, Python 3.11)
- Linux (Ubuntu 20.04, Python 3.9)
- Linux (Ubuntu 22.04, Python 3.11)
- Linux (Ubuntu 24.04, Python 3.12)
- Debian 10 ARM
- Android ARM64 with vcpkg
- Android x64
- Linux ARM64 (Ubuntu 20.04, Python 3.11)
- Linux Static CC (Ubuntu 22.04, Python 3.11, Clang)
- Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10)
- Windows (VS 2019, Python 3.11)
- Windows (VS 2019, Python 3.11, Release)
- Windows (VS 2019, Python 3.11, Debug)
- Windows Conditional Compilation (VS 2022, Python 3.11)
types:
- completed
Expand Down Expand Up @@ -56,6 +62,10 @@ jobs:
if: ${{ env.PIPELINE_RETRIGGERED == 'true' }}
run: echo "Rerun retriggered for ${{ github.event.workflow_run.html_url }} with ticket ${{ env.FOUND_ERROR_TICKET }}"

- name: ${{ github.event.workflow_run.html_url }}
if: ${{ env.PIPELINE_RETRIGGERED == 'true' }}
run: echo "Step for statistics gathering"

rerunner_tests:
name: Rerunner Tests
if: ${{ github.event_name == 'pull_request' && github.repository_owner == 'openvinotoolkit' }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def process_coveo_meta(meta, url, link):
namespace_element = ET.SubElement(url, namespace)

for tag_name, tag_value in values.items():
if tag_name == 'ovcategory':
if tag_name == 'ovdoctype':
processed_link = process_link(link)
ET.SubElement(namespace_element, tag_name).text = processed_link
else:
Expand Down
2 changes: 2 additions & 0 deletions docs/sphinx_setup/_static/js/custom.js
Original file line number Diff line number Diff line change
Expand Up @@ -417,13 +417,15 @@ document.addEventListener('DOMContentLoaded', function () {
await searchInterfaceSa.initialize({
accessToken: "xx1f2aebd3-4307-4632-aeea-17c13378b237",
organizationId: "intelcorporationnonproduction2ybdyblf7",
organizationEndpoints: await searchInterface.getOrganizationEndpoints('intelcorporationnonproduction2ybdyblf7')
});
searchInterfaceSa.executeFirstSearch();
}
if (searchInterface) {
await searchInterface.initialize({
accessToken: "xx1f2aebd3-4307-4632-aeea-17c13378b237",
organizationId: "intelcorporationnonproduction2ybdyblf7",
organizationEndpoints: await searchInterface.getOrganizationEndpoints('intelcorporationnonproduction2ybdyblf7')
});
searchInterface.executeFirstSearch();
}
Expand Down
2 changes: 1 addition & 1 deletion docs/sphinx_setup/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
ov_sitemap_meta = [
('coveo:metadata', {
'ovversion': version_name,
'ovcategory': 'null'
'ovdoctype': 'null'
})
]

Expand Down
40 changes: 34 additions & 6 deletions src/bindings/js/node/tests/e2e/demo-electron-app/index.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,39 @@
const { app } = require('electron');
const { addon: ov } = require('openvino-node');

app.whenReady().then(() => {
console.log('Creating OpenVINO Runtime Core');
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const core = new ov.Core();
console.log('Created OpenVINO Runtime Core');
const epsilon = 0.5; // To avoid very small numbers
const pathToModel = '../tests/unit/test_models/test_model_fp32.xml';

main();

async function main() {
await app.whenReady();

try {
console.log('Creating OpenVINO Runtime Core');
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const core = new ov.Core();
console.log('Created OpenVINO Runtime Core');

const model = await core.readModel(pathToModel);
console.log('Model read successfully:', model);
const compiledModel = await core.compileModel(model, 'CPU');
const inferRequest = compiledModel.createInferRequest();
console.log('Infer request created:', inferRequest);

const tensorData = Float32Array.from(
{ length: 3072 },
() => Math.random() + epsilon,
);
const tensor = new ov.Tensor(ov.element.f32, [1, 3, 32, 32], tensorData);
console.log('Tensor created:', tensor);

const result = await inferRequest.inferAsync([tensor]);
console.log('Infer request result:', result);
} catch (error) {
console.error('Error:', error);
app.exit(1);
}

app.exit(0);
});
}
29 changes: 15 additions & 14 deletions src/bindings/js/node/tests/e2e/electron-app.test.js
Original file line number Diff line number Diff line change
@@ -1,24 +1,17 @@
/* global describe, it, before, after */
const fs = require('node:fs');
const util = require('node:util');
const assert = require('node:assert');
const { exec } = require('child_process');
const execPromise = util.promisify(exec);
const { testModels, downloadTestModel } = require('../unit/utils.js');

describe('E2E testing for OpenVINO as an Electron dependency.', function() {
this.timeout(50000);

before((done) => {
exec(
'cp -r ./tests/e2e/demo-electron-app/ demo-electron-app-project',
(error) => {
if (error) {
console.error(`exec error: ${error}`);

return done(error);
}

done();
},
);
before(async () => {
await downloadTestModel(testModels.testModelFP32);
await execPromise('cp -r ./tests/e2e/demo-electron-app/ demo-electron-app-project');
});

it('should install dependencies', (done) => {
Expand All @@ -37,7 +30,7 @@ describe('E2E testing for OpenVINO as an Electron dependency.', function() {
});

it('should run electron package and verify output', (done) => {
exec('cd demo-electron-app-project && npm start', (error, stdout) => {
exec(`cd demo-electron-app-project && npm start`, (error, stdout) => {
if (error) {
console.error(`exec error: ${error}`);

Expand All @@ -48,6 +41,14 @@ describe('E2E testing for OpenVINO as an Electron dependency.', function() {
stdout.includes('Created OpenVINO Runtime Core'),
'Check that openvino-node operates fine',
);
assert(
stdout.includes('Model read successfully: ModelWrap {}'),
'Check that model is read successfully',
);
assert(
stdout.includes('Infer request result: { fc_out: TensorWrap {} }'),
'Check that infer request result is successful',
);
done();
});
});
Expand Down
2 changes: 1 addition & 1 deletion src/bindings/python/constraints.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# used in multiple components
numpy>=1.16.6,<2.1.0 # Python bindings, frontends
numpy>=1.16.6,<2.2.0 # Python bindings, frontends

# pytest
pytest>=5.0,<8.4
Expand Down
2 changes: 1 addition & 1 deletion src/bindings/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
numpy>=1.16.6,<2.1.0
numpy>=1.16.6,<2.2.0
openvino-telemetry>=2023.2.1
packaging
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ int get_model_prefer_threads(const int num_streams,
break;
case dnnl::cpu_isa::avx512_core_vnni:
case dnnl::cpu_isa::avx2_vnni:
case dnnl::cpu_isa::avx2_vnni_2:
isaSpecificThreshold = 2.0f;
break;
case dnnl::cpu_isa::avx512_core_amx:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
#include "transformations/rt_info/keep_const_precision.hpp"
#include "transformations/transpose_sinking/ts_shape_of.hpp"
#include "utils/ngraph_transformation.hpp"
#include "utils/print_model.hpp"
#include "transformations/utils/print_model.hpp"

// LPT transformations
#include "low_precision/add.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include <ov_ops/type_relaxed.hpp>

#include "common_test_utils/ov_test_utils.hpp"
#include "utils/print_model.hpp"
#include "transformations/utils/print_model.hpp"

using namespace testing;
using namespace ov;
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/thirdparty/ComputeLibrary
Submodule ComputeLibrary updated 143 files
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

// Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
// Calculate weight : w = (w - dzp) * ds
// if DECOMPRESSION_ZP_TERM is not enabled, then dzp is ACCUMULATOR_VAL_ZERO.
#if DECOMPRESSION_ZP_TERM
#if DECOMPRESSION_ZP_SCALAR
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE);
Expand All @@ -976,8 +977,6 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
}
}
#endif
#else
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO);
#endif

#if FILTER_LOAD_BLOCK_SIZE == 2
Expand Down Expand Up @@ -1026,7 +1025,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
#if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
const uint offset_ofm = out_f + fi*SIMD + sglid;
Expand All @@ -1046,7 +1045,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
#endif
} // Whole tile_k elements of each iteration : ki

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
#if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
// Dynamic-quantizing group size set to same or smaller than scale group size
if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
Expand Down Expand Up @@ -1175,7 +1174,7 @@ KERNEL(fc)(
#endif
) {
#if USE_SLM
#if DYNAMIC_QUANTIZE && (TILE_OFM == 2)
#if DYNAMIC_QUANTIZE
__local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
#else
__local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD];
Expand Down Expand Up @@ -1317,7 +1316,7 @@ KERNEL(fc)(
#endif
);
} else {
#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
#if USE_SLM && DYNAMIC_QUANTIZE
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
Expand Down Expand Up @@ -1364,7 +1363,7 @@ KERNEL(fc)(
#endif
}
#else
#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
#if USE_SLM && DYNAMIC_QUANTIZE
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,9 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,

if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4) {
if (!params.is_shape_agnostic && batch == 1) {
if (should_dynamic_quantize(params))
return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT));

// Tuning for Meteor Lake
if (is_weight_vertical(params, output_f)) {
if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
Expand Down Expand Up @@ -616,7 +619,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
// Validated perf gain, Dynamic quantize force enable SCALE_POST_OP for char type multiplication
if (should_dynamic_quantize(params)) {
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
namespace ov {
namespace intel_gpu {

ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8) {
ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed() {
using namespace ov::pass::pattern;

auto compressed_constant = [](const ov::Output<ov::Node>& output) {
Expand Down Expand Up @@ -81,6 +81,12 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
bool has_transpose = pattern_map.count(transpose_m);
auto scale_shape = pattern_map.at(mul_const_m).get_shape();
bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { return d > 1; }) > 1;
bool sub_with_convert = (pattern_map.count(sub_with_convert_m) > 0) ? true : false;

auto weight_ptr = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(weights_m).get_node_shared_ptr());
bool weight_u8 = false;
if (weight_ptr->get_element_type() == ov::element::u8 || weight_ptr->get_element_type() == ov::element::i8)
weight_u8 = true;

auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr<ov::Node> node) {
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
Expand All @@ -97,11 +103,17 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
};

auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr<ov::Node> node) {
auto convert_const_to_u8 = [&](std::shared_ptr<ov::Node> node) {
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8)
// Convert ZP to u8
if (constant->get_element_type() == ov::element::u8)
return std::dynamic_pointer_cast<ov::Node>(constant);
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
if (constant->get_element_type() == ov::element::u4)
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
if (weight_u8 && sub_with_convert)
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));

return std::dynamic_pointer_cast<ov::Node>(constant);
};


Expand All @@ -111,8 +123,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon

const bool with_zero_point = pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0;
if (with_zero_point) {
// WA: Convert ZP to u8 for OneDNN case to avoid u4 reorder
optional_zero_point = convert_u4const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr()));
optional_zero_point = convert_const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr()));
}

std::shared_ptr<ov::Node> fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace intel_gpu {
class ConvertFullyConnectedToFullyConnectedCompressed: public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0");
ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8 = false);
ConvertFullyConnectedToFullyConnectedCompressed();
};

} // namespace intel_gpu
Expand Down
9 changes: 5 additions & 4 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::ClampFP16Output>();
manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(device_info.supports_immad);
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>();

bool disable_horizontal_fc_fusion = false;
GPU_DEBUG_GET_INSTANCE(debug_config);
Expand All @@ -819,10 +819,11 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

if (!disable_horizontal_fc_fusion)
manager.register_pass<ov::intel_gpu::FullyConnectedHorizontalFusion>();

// ZP should not be folded for FC. But still, ZP should be folded for Gather.
// Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
if (device_info.supports_immad) {
// For OneDNN, ZP should not be folded for FC. But still, ZP should be folded for Gather.
// Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
if (disable_horizontal_fc_fusion)
manager.register_pass<ov::pass::ConstantFolding>();
}
Expand Down
Loading

0 comments on commit 313f794

Please sign in to comment.