From 3e63de016bced2a7ad18550fa5f8ca8fb47ba394 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Fri, 15 Nov 2024 20:57:31 +0800 Subject: [PATCH 01/53] [NPU] Remove template in ext wrapper and fuse functions (#27511) ### Details: - *Remove template in zero_ext_graph_wrappers* - *Remove zero_ext_graph_wrappers_interface.hpp* - *Add more low level debug log* - *Update level-zero-ext repo commit to use 1.9 version* ### Tickets: - *156387* --------- Signed-off-by: Xin Wang --- .../include/driver_compiler_adapter.hpp | 4 +- .../compiler_adapter/include/driver_graph.hpp | 6 +- .../include/plugin_compiler_adapter.hpp | 4 +- .../compiler_adapter/include/plugin_graph.hpp | 6 +- .../include/ze_graph_ext_wrappers.hpp | 112 +--- .../ze_graph_ext_wrappers_interface.hpp | 42 -- .../src/driver_compiler_adapter.cpp | 24 +- .../src/compiler_adapter/src/driver_graph.cpp | 2 +- .../src/plugin_compiler_adapter.cpp | 24 +- .../src/compiler_adapter/src/plugin_graph.cpp | 2 +- .../src/ze_graph_ext_wrappers.cpp | 527 ++++++++---------- .../intel_npu/thirdparty/level-zero-ext | 2 +- 12 files changed, 269 insertions(+), 486 deletions(-) delete mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers_interface.hpp diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp index dc000b99d7446b..82ababf21c147a 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp @@ -16,7 +16,7 @@ #include "intel_npu/config/config.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_init.hpp" -#include "ze_graph_ext_wrappers_interface.hpp" +#include "ze_graph_ext_wrappers.hpp" namespace intel_npu { @@ -54,7 +54,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter { std::string serializeConfig(const Config& config, ze_graph_compiler_version_info_t compilerVersion) const; std::shared_ptr _zeroInitStruct; - std::shared_ptr _zeGraphExt; + std::shared_ptr _zeGraphExt; ze_device_graph_properties_t _deviceGraphProperties = {}; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp index f7ea940cf9a160..0f426581687f65 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp @@ -10,13 +10,13 @@ #include "intel_npu/common/igraph.hpp" #include "intel_npu/utils/zero/zero_init.hpp" -#include "ze_graph_ext_wrappers_interface.hpp" +#include "ze_graph_ext_wrappers.hpp" namespace intel_npu { class DriverGraph final : public IGraph { public: - DriverGraph(const std::shared_ptr& zeGraphExt, + DriverGraph(const std::shared_ptr& zeGraphExt, const std::shared_ptr& zeroInitStruct, ze_graph_handle_t graphHandle, NetworkMetadata metadata, @@ -37,7 +37,7 @@ class DriverGraph final : public IGraph { private: bool release_blob(const Config& config); - std::shared_ptr _zeGraphExt; + std::shared_ptr _zeGraphExt; std::shared_ptr _zeroInitStruct; Logger _logger; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp index eab8a19627cd1c..8d2616884e7d5f 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp +++ 
b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp @@ -11,7 +11,7 @@ #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_init.hpp" #include "openvino/runtime/so_ptr.hpp" -#include "ze_graph_ext_wrappers_interface.hpp" +#include "ze_graph_ext_wrappers.hpp" namespace intel_npu { @@ -28,7 +28,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter { private: std::shared_ptr _zeroInitStruct; - std::shared_ptr _zeGraphExt; + std::shared_ptr _zeGraphExt; ov::SoPtr _compiler; Logger _logger; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp index 1028112368e67f..2d7d9bfd429e47 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp @@ -12,13 +12,13 @@ #include "intel_npu/icompiler.hpp" #include "intel_npu/utils/zero/zero_init.hpp" #include "openvino/runtime/so_ptr.hpp" -#include "ze_graph_ext_wrappers_interface.hpp" +#include "ze_graph_ext_wrappers.hpp" namespace intel_npu { class PluginGraph final : public IGraph { public: - PluginGraph(const std::shared_ptr& zeGraphExt, + PluginGraph(const std::shared_ptr& zeGraphExt, const ov::SoPtr& compiler, const std::shared_ptr& zeroInitStruct, ze_graph_handle_t graphHandle, @@ -38,7 +38,7 @@ class PluginGraph final : public IGraph { ~PluginGraph() override; private: - std::shared_ptr _zeGraphExt; + std::shared_ptr _zeGraphExt; std::shared_ptr _zeroInitStruct; const ov::SoPtr _compiler; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp index 1bc58b153a48ff..3e8c17ad13db7e 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp @@ -10,42 +10,19 @@ #include #include +#include "intel_npu/network_metadata.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_init.hpp" #include "intel_npu/utils/zero/zero_types.hpp" -#include "ze_graph_ext_wrappers_interface.hpp" namespace intel_npu { -#define NotSupportQuery(T) (T == ZE_GRAPH_EXT_VERSION_1_2) - -// ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, -// pfnQueryNetworkGetSupportedLayers) -#define SupportAPIGraphQueryNetworkV1(T) (T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) - -// ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) -#define SupportAPIGraphQueryNetworkV2(T) ((!NotSupportQuery(T) && !SupportAPIGraphQueryNetworkV1(T))) - -// For ext version >= 1.5, pfnCreate2 api is avaible -#define NotSupportGraph2(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) - -// A bug inside the driver makes the "pfnGraphGetArgumentMetadata" call not safe for use prior to -// "ze_graph_dditable_ext_1_6_t". 
-// See: E#117498 -#define NotSupportArgumentMetadata(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ - T == ZE_GRAPH_EXT_VERSION_1_5) - -#define UseCopyForNativeBinary(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ - T == ZE_GRAPH_EXT_VERSION_1_5 || T == ZE_GRAPH_EXT_VERSION_1_6) +using SerializedIR = std::pair>; /** * Adapter to use CiD through ZeroAPI */ -template -class ZeGraphExtWrappers final : public ZeGraphExtWrappersInterface { +class ZeGraphExtWrappers { public: ZeGraphExtWrappers(const std::shared_ptr& zeroInitStruct); ZeGraphExtWrappers(const ZeGraphExtWrappers&) = delete; @@ -53,105 +30,40 @@ class ZeGraphExtWrappers final : public ZeGraphExtWrappersInterface { ~ZeGraphExtWrappers(); std::unordered_set queryGraph(std::pair> serializedIR, - const std::string& buildFlags) const override; + const std::string& buildFlags) const; ze_graph_handle_t getGraphHandle(std::pair> serializedIR, const std::string& buildFlags, - const uint32_t& flags) const override; + const uint32_t& flags) const; - ze_graph_handle_t getGraphHandle(const std::vector& network) const override; + ze_graph_handle_t getGraphHandle(const std::vector& network) const; - NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const override; + NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const; - _ze_result_t destroyGraph(ze_graph_handle_t graphHandle) override; + _ze_result_t destroyGraph(ze_graph_handle_t graphHandle); void getGraphBinary(ze_graph_handle_t graphHandle, std::vector& blob, const uint8_t*& blobPtr, - size_t& blobSize) const override; + size_t& blobSize) const; - void setGraphArgumentValue(ze_graph_handle_t graphHandle, uint32_t argi_, const void* argv) const override; + void setGraphArgumentValue(ze_graph_handle_t graphHandle, uint32_t argi_, const void* argv) const; - void initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const override; + void initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const; private: - template = true> std::unordered_set getQueryResultFromSupportedLayers( ze_result_t result, ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - template = true> void getMetadata(ze_graph_handle_t graphHandle, uint32_t index, std::vector& inputs, std::vector& outputs) const; - template = true> - void getMetadata(ze_graph_handle_t graphHandle, - uint32_t index, - std::vector& inputs, - std::vector& outputs) const; - - template = true> - void getNativeBinary(ze_graph_handle_t graphHandle, - std::vector& blob, - const uint8_t*& blobPtr, - size_t& blobSize) const; - - template = true> - void getNativeBinary(ze_graph_handle_t graphHandle, - std::vector& /* unusedBlob */, - const uint8_t*& blobPtr, - size_t& blobSize) const; - - template = true> - ze_result_t queryNetworkCreateV2(std::pair> serializedIR, - const std::string& buildFlags, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - // ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) - template = true> - std::unordered_set queryImpl(std::pair> serializedIR, - const std::string& buildFlags) const; - - template = true> - ze_result_t queryNetworkCreateV1(std::pair> serializedIR, - const std::string& buildFlags, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - // ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, - // 
pfnQueryNetworkGetSupportedLayers) - template = true> - std::unordered_set queryImpl(std::pair> serializedIR, - const std::string& buildFlags) const; - - // For ext version < 1.3 - template = true> - std::unordered_set queryImpl(std::pair> serializedIR, - const std::string& buildFlags) const; - - template = true> - void createGraph(std::pair> serializedIR, - const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const; - - template = true> - void createGraph(std::pair> serializedIR, - const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const; - void initialize_graph_through_command_list(ze_graph_handle_t graphHandle, const Config& config) const; std::shared_ptr _zeroInitStruct; + uint32_t _graphExtVersion; Logger _logger; }; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers_interface.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers_interface.hpp deleted file mode 100644 index ac44f9853e11e3..00000000000000 --- a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers_interface.hpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#include "intel_npu/network_metadata.hpp" - -namespace intel_npu { - -using SerializedIR = std::pair>; - -class ZeGraphExtWrappersInterface { -public: - virtual std::unordered_set queryGraph(SerializedIR serializedIR, - const std::string& buildFlags) const = 0; - - virtual ze_graph_handle_t getGraphHandle(SerializedIR serializedIR, - const std::string& buildFlags, - const uint32_t& flags) const = 0; - - virtual ze_graph_handle_t getGraphHandle(const std::vector& network) const = 0; - - virtual NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const = 0; - - virtual _ze_result_t destroyGraph(ze_graph_handle_t graphHandle) = 0; - - virtual void getGraphBinary(ze_graph_handle_t graphHandle, - std::vector& blob, - const uint8_t*& blobPtr, - size_t& blobSize) const = 0; - - virtual void setGraphArgumentValue(ze_graph_handle_t graphHandle, uint32_t argi_, const void* argv) const = 0; - - virtual void initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const = 0; - - virtual ~ZeGraphExtWrappersInterface() = default; -}; - -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index b4da8a2bcc316b..f819ed73711cf2 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -155,29 +155,7 @@ DriverCompilerAdapter::DriverCompilerAdapter(const std::shared_ptr>(_zeroInitStruct); - break; - case ZE_GRAPH_EXT_VERSION_1_4: - _zeGraphExt = std::make_shared>(_zeroInitStruct); - break; - case ZE_GRAPH_EXT_VERSION_1_5: - _zeGraphExt = std::make_shared>(_zeroInitStruct); - break; - case ZE_GRAPH_EXT_VERSION_1_6: - _zeGraphExt = std::make_shared>(_zeroInitStruct); - break; - case ZE_GRAPH_EXT_VERSION_1_7: - _zeGraphExt = std::make_shared>(_zeroInitStruct); - break; - case ZE_GRAPH_EXT_VERSION_1_8: - _zeGraphExt = std::make_shared>(_zeroInitStruct); - break; - default: - _zeGraphExt = std::make_shared>(_zeroInitStruct); - break; - } + _zeGraphExt = std::make_shared(_zeroInitStruct); _logger.info("initialize DriverCompilerAdapter complete, using graphExtVersion: %d.%d", 
ZE_MAJOR_VERSION(graphExtVersion),
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
index 84759bf802f1c1..e1f3990b835e8d 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
@@ -10,7 +10,7 @@

 namespace intel_npu {

-DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt,
+DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt,
                          const std::shared_ptr& zeroInitStruct,
                          ze_graph_handle_t graphHandle,
                          NetworkMetadata metadata,
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
index 73dd3817e24812..06d71fd1126c17 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
@@ -70,29 +70,7 @@ PluginCompilerAdapter::PluginCompilerAdapter(const std::shared_ptr
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    case ZE_GRAPH_EXT_VERSION_1_4:
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    case ZE_GRAPH_EXT_VERSION_1_5:
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    case ZE_GRAPH_EXT_VERSION_1_6:
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    case ZE_GRAPH_EXT_VERSION_1_7:
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    case ZE_GRAPH_EXT_VERSION_1_8:
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    default:
-        _zeGraphExt = std::make_shared>(_zeroInitStruct);
-        break;
-    }
+    _zeGraphExt = std::make_shared(_zeroInitStruct);

     _logger.info("initialize PluginCompilerAdapter complete, using graphExtVersion: %d.%d",
                  ZE_MAJOR_VERSION(graphExtVersion),
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
index 8f60efd50af75c..c99069a0a9760f 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
@@ -10,7 +10,7 @@

 namespace intel_npu {

-PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt,
+PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt,
                          const ov::SoPtr& compiler,
                          const std::shared_ptr& zeroInitStruct,
                          ze_graph_handle_t graphHandle,
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp
index fad389ca30e0c7..f6366a2509747b 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp
@@ -14,6 +14,25 @@
 #include "intel_npu/utils/zero/zero_wrappers.hpp"
 #include "openvino/core/model.hpp"

+#define NotSupportQuery(T) (T <= ZE_GRAPH_EXT_VERSION_1_2)
+
+// ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy,
+// pfnQueryNetworkGetSupportedLayers)
+#define SupportAPIGraphQueryNetworkV1(T) (T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4)
+
+// ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory)
+#define SupportAPIGraphQueryNetworkV2(T) ((!NotSupportQuery(T) && !SupportAPIGraphQueryNetworkV1(T)))
+
+// For ext version >= 1.5, the pfnCreate2 API is available
+#define NotSupportGraph2(T) (T < ZE_GRAPH_EXT_VERSION_1_5)
+
+// A bug inside the driver makes the "pfnGraphGetArgumentMetadata" call not safe for use prior to
"pfnGraphGetArgumentMetadata" call not safe for use prior to +// "ze_graph_dditable_ext_1_6_t". +// See: E#117498 +#define NotSupportArgumentMetadata(T) (T < ZE_GRAPH_EXT_VERSION_1_6) + +#define UseCopyForNativeBinary(T) (T < ZE_GRAPH_EXT_VERSION_1_7) + namespace { ov::element::Type_t toOVElementType(const ze_graph_argument_precision_t zeElementType) { @@ -63,19 +82,28 @@ ov::element::Type_t toOVElementType(const ze_graph_argument_precision_t zeElemen namespace intel_npu { -template -ZeGraphExtWrappers::ZeGraphExtWrappers(const std::shared_ptr& zeroInitStruct) +ZeGraphExtWrappers::ZeGraphExtWrappers(const std::shared_ptr& zeroInitStruct) : _zeroInitStruct(zeroInitStruct), - _logger("ZeGraphExtWrappers", Logger::global().level()) {} + _graphExtVersion(zeroInitStruct->getGraphDdiTable().version()), + _logger("ZeGraphExtWrappers", Logger::global().level()) { + _logger.info("Graph ext version used by zero wrapper: %d.%d", + ZE_MAJOR_VERSION(_graphExtVersion), + ZE_MINOR_VERSION(_graphExtVersion)); + _logger.debug("capabilities:"); + _logger.debug("-SupportQuery: %d", !NotSupportQuery(_graphExtVersion)); + _logger.debug("-SupportAPIGraphQueryNetworkV1: %d", SupportAPIGraphQueryNetworkV1(_graphExtVersion)); + _logger.debug("-SupportAPIGraphQueryNetworkV2 :%d", SupportAPIGraphQueryNetworkV2(_graphExtVersion)); + _logger.debug("-SupportpfnCreate2 :%d", !NotSupportGraph2(_graphExtVersion)); + _logger.debug("-SupportArgumentMetadata :%d", !NotSupportArgumentMetadata(_graphExtVersion)); + _logger.debug("-UseCopyForNativeBinary :%d", UseCopyForNativeBinary(_graphExtVersion)); +} -template -ZeGraphExtWrappers::~ZeGraphExtWrappers() { - _logger.debug("ZeGraphExtWrappers obj destroyed"); +ZeGraphExtWrappers::~ZeGraphExtWrappers() { + _logger.debug("Obj destroyed"); } -template -_ze_result_t ZeGraphExtWrappers::destroyGraph(ze_graph_handle_t graphHandle) { - _logger.debug("destroyGraph - pfnDestroy graphHandle"); +_ze_result_t ZeGraphExtWrappers::destroyGraph(ze_graph_handle_t graphHandle) { + _logger.debug("destroyGraph - perfrom pfnDestroy"); auto result = _zeroInitStruct->getGraphDdiTable().pfnDestroy(graphHandle); if (ZE_RESULT_SUCCESS != result) { @@ -87,73 +115,62 @@ _ze_result_t ZeGraphExtWrappers::destroyGraph(ze_graph_handle_t return result; } -template -template > -void ZeGraphExtWrappers::getNativeBinary(ze_graph_handle_t graphHandle, - std::vector& blob, - const uint8_t*& blobPtr, - size_t& blobSize) const { - // Get blob size first - auto result = _zeroInitStruct->getGraphDdiTable().pfnGetNativeBinary(graphHandle, &blobSize, nullptr); - blob.resize(blobSize); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetNativeBinary get blob size, Failed to compile network.", - result, - _zeroInitStruct->getGraphDdiTable()); - - // Get blob data - result = _zeroInitStruct->getGraphDdiTable().pfnGetNativeBinary(graphHandle, &blobSize, blob.data()); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetNativeBinary get blob data, Failed to compile network.", - result, - _zeroInitStruct->getGraphDdiTable()); - - blobPtr = blob.data(); -} - -template -template > -void ZeGraphExtWrappers::getNativeBinary(ze_graph_handle_t graphHandle, - std::vector& /* unusedBlob */, - const uint8_t*& blobPtr, - size_t& blobSize) const { - // Get blob ptr and size - auto result = _zeroInitStruct->getGraphDdiTable().pfnGetNativeBinary2(graphHandle, &blobSize, &blobPtr); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetNativeBinary get blob size, Failed to compile network.", - result, - _zeroInitStruct->getGraphDdiTable()); -} - -template -void 
-void ZeGraphExtWrappers::getGraphBinary(ze_graph_handle_t graphHandle,
-                                        std::vector& blob,
-                                        const uint8_t*& blobPtr,
-                                        size_t& blobSize) const {
+void ZeGraphExtWrappers::getGraphBinary(ze_graph_handle_t graphHandle,
+                                        std::vector& blob,
+                                        const uint8_t*& blobPtr,
+                                        size_t& blobSize) const {
     if (graphHandle == nullptr) {
         OPENVINO_THROW("Graph handle is null");
     }

-    _logger.info("ZeGraphExtWrappers getGraphBinary get blob from graphHandle");
-
-    getNativeBinary(graphHandle, blob, blobPtr, blobSize);
+    _logger.debug("getGraphBinary - get blob from graphHandle");
+
+    if (UseCopyForNativeBinary(_graphExtVersion)) {
+        // Get blob size first
+        _logger.debug("getGraphBinary - perform pfnGetNativeBinary to get size");
+        auto result = _zeroInitStruct->getGraphDdiTable().pfnGetNativeBinary(graphHandle, &blobSize, nullptr);
+        blob.resize(blobSize);
+        THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetNativeBinary get blob size, Failed to compile network.",
+                                        result,
+                                        _zeroInitStruct->getGraphDdiTable());
+
+        // Get blob data
+        _logger.debug("getGraphBinary - perform pfnGetNativeBinary to get data");
+        result = _zeroInitStruct->getGraphDdiTable().pfnGetNativeBinary(graphHandle, &blobSize, blob.data());
+        THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetNativeBinary get blob data, Failed to compile network.",
+                                        result,
+                                        _zeroInitStruct->getGraphDdiTable());
+
+        blobPtr = blob.data();
+    } else {
+        // Get blob ptr and size
+        _logger.debug("getGraphBinary - perform pfnGetNativeBinary2 to get size and data");
+        auto result = _zeroInitStruct->getGraphDdiTable().pfnGetNativeBinary2(graphHandle, &blobSize, &blobPtr);
+        THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetNativeBinary get blob size, Failed to compile network.",
+                                        result,
+                                        _zeroInitStruct->getGraphDdiTable());
+    }
 }

-template 
-void ZeGraphExtWrappers::setGraphArgumentValue(ze_graph_handle_t graphHandle,
-                                               uint32_t argi,
-                                               const void* argv) const {
+void ZeGraphExtWrappers::setGraphArgumentValue(ze_graph_handle_t graphHandle, uint32_t argi, const void* argv) const {
+    _logger.debug("setGraphArgumentValue - perform pfnSetArgumentValue");
     auto result = _zeroInitStruct->getGraphDdiTable().pfnSetArgumentValue(graphHandle, argi, argv);
     THROW_ON_FAIL_FOR_LEVELZERO_EXT("zeGraphSetArgumentValue", result, _zeroInitStruct->getGraphDdiTable());
 }

-template 
-void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const {
+void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const {
     if (_zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8) {
+        _logger.debug("Use initialize_graph_through_command_list for ext version older than 1.8");
         initialize_graph_through_command_list(graphHandle, config);
     } else {
+        _logger.debug("Initialize graph based on graph properties for ext version 1.8 or newer");
         ze_graph_properties_2_t properties = {};
         properties.stype = ZE_STRUCTURE_TYPE_GRAPH_PROPERTIES;
+        _logger.debug("initializeGraph - perform pfnGetProperties2");
        _zeroInitStruct->getGraphDdiTable().pfnGetProperties2(graphHandle, &properties);

         if (properties.initStageRequired & ZE_GRAPH_STAGE_INITIALIZE) {
+            _logger.debug("initializeGraph - perform pfnGraphInitialize");
             _zeroInitStruct->getGraphDdiTable().pfnGraphInitialize(graphHandle);
         }

@@ -163,32 +180,31 @@ void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graph
     }
 }

-template 
-void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graphHandle,
-                                                               const Config& config) const {
+void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graphHandle,
+                                                               const Config& config) const {
     ze_device_properties_t deviceProperties = {};
     deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
     THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
                                 zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
     auto groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties);

-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list init start - create graph_command_list");
+    _logger.debug("initialize_graph_through_command_list init start - create graph_command_list");
     CommandList graph_command_list(_zeroInitStruct, groupOrdinal);
-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - create graph_command_queue");
+    _logger.debug("initialize_graph_through_command_list - create graph_command_queue");
     CommandQueue graph_command_queue(_zeroInitStruct, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, groupOrdinal, false);
-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - create fence");
+    _logger.debug("initialize_graph_through_command_list - create fence");
     Fence fence(graph_command_queue);

-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - performing appendGraphInitialize");
+    _logger.debug("initialize_graph_through_command_list - performing appendGraphInitialize");
     graph_command_list.appendGraphInitialize(graphHandle);
-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - closing graph command list");
+    _logger.debug("initialize_graph_through_command_list - closing graph command list");
     graph_command_list.close();

-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - performing executeCommandList");
+    _logger.debug("initialize_graph_through_command_list - performing executeCommandList");
     graph_command_queue.executeCommandList(graph_command_list, fence);

-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - performing hostSynchronize");
+    _logger.debug("initialize_graph_through_command_list - performing hostSynchronize");
     fence.hostSynchronize();
-    _logger.debug("ZeGraphExtWrappers::initialize_graph_through_command_list - hostSynchronize completed");
+    _logger.debug("initialize_graph_through_command_list - hostSynchronize completed");
 }

 // Parse the result string of query from format to unordered_set of string
@@ -210,102 +226,17 @@ static std::unordered_set parseQueryResult(std::vector& data)
     return result;
 }

-// For ext version < 1.3, query is unsupported, return empty result and add debug log here
-template 
-template >
-std::unordered_set ZeGraphExtWrappers::queryImpl(
-    std::pair>,
-    const std::string&) const {
-    _logger.info("queryImpl - Driver version is less than 1.3, queryNetwork is unsupported.");
-    return std::unordered_set();
-}
-
-// For ext version == 1.3 && == 1.4
-template 
-template >
-ze_result_t ZeGraphExtWrappers::queryNetworkCreateV1(
-    std::pair> serializedIR,
-    const std::string& buildFlags,
-    ze_graph_query_network_handle_t& hGraphQueryNetwork) const {
-    ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
-                            nullptr,
-                            ZE_GRAPH_FORMAT_NGRAPH_LITE,
-                            serializedIR.first,
-                            serializedIR.second.get(),
-                            buildFlags.c_str()};
-
-    // Create querynetwork handle
-    ze_result_t result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkCreate(_zeroInitStruct->getContext(),
-                                                                                   _zeroInitStruct->getDevice(),
-                                                                                   &desc,
-                                                                                   &hGraphQueryNetwork);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("queryNetworkCreateV1", result, _zeroInitStruct->getGraphDdiTable()); - - return result; -} - -// For ext version == 1.3 && == 1.4, query is supported, calling querynetwork api in _zeroInitStruct->getGraphDdiTable() -template -template > -std::unordered_set ZeGraphExtWrappers::queryImpl( - std::pair> serializedIR, - const std::string& buildFlags) const { - _logger.info("queryImpl - Calling queryNetwork of 1.3 version."); - - ze_graph_query_network_handle_t hGraphQueryNetwork = nullptr; - - auto result = queryNetworkCreateV1(std::move(serializedIR), buildFlags, hGraphQueryNetwork); - - return getQueryResultFromSupportedLayers(result, hGraphQueryNetwork); -} - -// For ext version >= 1.5 -template -template > -ze_result_t ZeGraphExtWrappers::queryNetworkCreateV2( - std::pair> serializedIR, - const std::string& buildFlags, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const { - ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NGRAPH_LITE, - serializedIR.first, - serializedIR.second.get(), - buildFlags.c_str(), - ZE_GRAPH_FLAG_NONE}; - - // Create querynetwork handle - _logger.debug("queryNetworkCreateV2 - performing pfnQueryNetworkCreate2"); - ze_result_t result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkCreate2(_zeroInitStruct->getContext(), - _zeroInitStruct->getDevice(), - &desc, - &hGraphQueryNetwork); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("queryNetworkCreateV2", result, _zeroInitStruct->getGraphDdiTable()); - - return result; -} - -// For ext version >= 1.5 -template -template > -std::unordered_set ZeGraphExtWrappers::queryImpl( - std::pair> serializedIR, - const std::string& buildFlags) const { - _logger.debug("queryImpl - Calling queryNetwork of 1.5 version."); - - ze_graph_query_network_handle_t hGraphQueryNetwork = nullptr; - - auto result = queryNetworkCreateV2(std::move(serializedIR), buildFlags, hGraphQueryNetwork); - - return getQueryResultFromSupportedLayers(result, hGraphQueryNetwork); -} - -template -template > -std::unordered_set ZeGraphExtWrappers::getQueryResultFromSupportedLayers( +std::unordered_set ZeGraphExtWrappers::getQueryResultFromSupportedLayers( ze_result_t result, ze_graph_query_network_handle_t& hGraphQueryNetwork) const { + if (NotSupportQuery(_graphExtVersion)) { + OPENVINO_THROW("pfnQueryNetworkGetSupportedLayers not supported for ", + ZE_MAJOR_VERSION(_graphExtVersion), + ".", + ZE_MINOR_VERSION(_graphExtVersion)); + } // Get the size of query result + _logger.debug("getQueryResultFromSupportLayers - perfrom pfnQueryNetworkGetSupportedLayers to get size"); size_t size = 0; result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkGetSupportedLayers(hGraphQueryNetwork, &size, nullptr); THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryNetworkGetSupportedLayers get size of query result", @@ -313,6 +244,7 @@ std::unordered_set ZeGraphExtWrappers::getQueryResu _zeroInitStruct->getGraphDdiTable()); // Get the result data of query + _logger.debug("getQueryResultFromSupportLayers - perfrom pfnQueryNetworkGetSupportedLayers to get data"); std::vector supportedLayers(size); result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkGetSupportedLayers(hGraphQueryNetwork, &size, @@ -321,80 +253,117 @@ std::unordered_set ZeGraphExtWrappers::getQueryResu result, _zeroInitStruct->getGraphDdiTable()); + _logger.debug("getQueryResultFromSupportLayers - perfrom pfnQueryNetworkDestroy"); result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkDestroy(hGraphQueryNetwork); 
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryNetworkDestroy", result, _zeroInitStruct->getGraphDdiTable()); return parseQueryResult(supportedLayers); } -template -std::unordered_set ZeGraphExtWrappers::queryGraph( - std::pair> serializedIR, - const std::string& buildFlags) const { - return queryImpl(std::move(serializedIR), buildFlags); -} - -// For ext version <1.5, calling pfnCreate api in _zeroInitStruct->getGraphDdiTable() -template -template > -void ZeGraphExtWrappers::createGraph(std::pair> serializedIR, - const std::string& buildFlags, - const uint32_t& /*flags*/, - ze_graph_handle_t* graph) const { - ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NGRAPH_LITE, - serializedIR.first, - serializedIR.second.get(), - buildFlags.c_str()}; - - _logger.debug("createGraph - performing pfnCreate"); - // Create querynetwork handle - auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(), - _zeroInitStruct->getDevice(), - &desc, - graph); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate", result, _zeroInitStruct->getGraphDdiTable()); +std::unordered_set ZeGraphExtWrappers::queryGraph(std::pair> serializedIR, + const std::string& buildFlags) const { + // ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) + // ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, + // pfnQueryNetworkGetSupportedLayers) + // For ext version < 1.3, query is not supported + ze_result_t result = ZE_RESULT_SUCCESS; + if (NotSupportQuery(_graphExtVersion)) { + // For ext version < 1.3, query is unsupported, return empty result and add debug log here + _logger.warning("queryGraph - Driver version is less than 1.3, queryNetwork is unsupported."); + return std::unordered_set(); + } else if (SupportAPIGraphQueryNetworkV1(_graphExtVersion)) { + // For ext version == 1.3 && == 1.4, query is supported, calling querynetwork api in + // _zeroInitStruct->getGraphDdiTable() + ze_graph_query_network_handle_t hGraphQueryNetwork = nullptr; + + // For ext version == 1.3 && == 1.4 + ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, + nullptr, + ZE_GRAPH_FORMAT_NGRAPH_LITE, + serializedIR.first, + serializedIR.second.get(), + buildFlags.c_str()}; + + // Create querynetwork handle + _logger.debug("For ext of 1.3 and 1.4 - perform pfnQueryNetworkCreate"); + result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkCreate(_zeroInitStruct->getContext(), + _zeroInitStruct->getDevice(), + &desc, + &hGraphQueryNetwork); + THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryNetworkCreate", result, _zeroInitStruct->getGraphDdiTable()); + + return getQueryResultFromSupportedLayers(result, hGraphQueryNetwork); + } else if (SupportAPIGraphQueryNetworkV2(_graphExtVersion)) { + // For ext version >= 1.5 + ze_graph_query_network_handle_t hGraphQueryNetwork = nullptr; + + // For ext version >= 1.5 + ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, + nullptr, + ZE_GRAPH_FORMAT_NGRAPH_LITE, + serializedIR.first, + serializedIR.second.get(), + buildFlags.c_str(), + ZE_GRAPH_FLAG_NONE}; + + // Create querynetwork handle + _logger.debug("For ext larger than 1.4 - perform pfnQueryNetworkCreate2"); + result = _zeroInitStruct->getGraphDdiTable().pfnQueryNetworkCreate2(_zeroInitStruct->getContext(), + _zeroInitStruct->getDevice(), + &desc, + &hGraphQueryNetwork); + THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryNetworkCreate2", result, _zeroInitStruct->getGraphDdiTable()); + + return 
getQueryResultFromSupportedLayers(result, hGraphQueryNetwork); + } + _logger.warning("queryGraph - Driver version is %d.%d, queryNetwork is unsupported.", + ZE_MAJOR_VERSION(_graphExtVersion), + ZE_MINOR_VERSION(_graphExtVersion)); + return std::unordered_set(); } -// For ext version >= 1.5, calling pfnCreate2 api in _zeroInitStruct->getGraphDdiTable() -template -template > -void ZeGraphExtWrappers::createGraph(std::pair> serializedIR, +ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(std::pair> serializedIR, const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const { - ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NGRAPH_LITE, - serializedIR.first, - serializedIR.second.get(), - buildFlags.c_str(), - flags}; - - _logger.debug("createGraph - performing pfnCreate2"); - // Create querynetwork handle - auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate2(_zeroInitStruct->getContext(), - _zeroInitStruct->getDevice(), - &desc, - graph); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate2", result, _zeroInitStruct->getGraphDdiTable()); -} - -template -ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle( - std::pair> serializedIR, - const std::string& buildFlags, - const uint32_t& flags) const { + const uint32_t& flags) const { ze_graph_handle_t graphHandle; - - createGraph(std::move(serializedIR), buildFlags, flags, &graphHandle); - + if (NotSupportGraph2(_graphExtVersion)) { + // For ext version <1.5, calling pfnCreate api in _zeroInitStruct->getGraphDdiTable() + ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, + nullptr, + ZE_GRAPH_FORMAT_NGRAPH_LITE, + serializedIR.first, + serializedIR.second.get(), + buildFlags.c_str()}; + + _logger.debug("getGraphHandle - perform pfnCreate"); + // Create querynetwork handle + auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(), + _zeroInitStruct->getDevice(), + &desc, + &graphHandle); + THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate", result, _zeroInitStruct->getGraphDdiTable()); + } else { + // For ext version >= 1.5, calling pfnCreate2 api in _zeroInitStruct->getGraphDdiTable() + ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, + nullptr, + ZE_GRAPH_FORMAT_NGRAPH_LITE, + serializedIR.first, + serializedIR.second.get(), + buildFlags.c_str(), + flags}; + + _logger.debug("getGraphHandle - perform pfnCreate2"); + // Create querynetwork handle + auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate2(_zeroInitStruct->getContext(), + _zeroInitStruct->getDevice(), + &desc, + &graphHandle); + THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate2", result, _zeroInitStruct->getGraphDdiTable()); + } return graphHandle; } -template -ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const std::vector& network) const { +ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const std::vector& network) const { ze_graph_handle_t graphHandle; if (network.empty()) { @@ -408,6 +377,7 @@ ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const std:: network.data(), nullptr}; + _logger.debug("getGraphHandle - perform pfnCreate"); auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(), _zeroInitStruct->getDevice(), &desc, @@ -473,87 +443,74 @@ static IODescriptor getIODescriptor(const ze_graph_argument_properties_3_t& arg, metadata.has_value() ? 
std::optional(shapeFromIRModel) : std::nullopt};
}

-template 
-template >
-void ZeGraphExtWrappers::getMetadata(ze_graph_handle_t graphHandle,
-                                     uint32_t index,
-                                     std::vector& inputs,
-                                     std::vector& outputs) const {
-    ze_graph_argument_properties_3_t arg;
-    auto result = _zeroInitStruct->getGraphDdiTable().pfnGetArgumentProperties3(graphHandle, index, &arg);
-    THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetArgumentProperties3", result, _zeroInitStruct->getGraphDdiTable());
-
-    switch (arg.type) {
-    case ZE_GRAPH_ARGUMENT_TYPE_INPUT: {
-        inputs.push_back(getIODescriptor(arg, std::nullopt));
-    } break;
-    case ZE_GRAPH_ARGUMENT_TYPE_OUTPUT: {
-        outputs.push_back(getIODescriptor(arg, std::nullopt));
-    } break;
-    default: {
-        OPENVINO_THROW("Invalid ze_graph_argument_type_t found in ze_graph_argument_properties_3_t object: ", arg.type);
-    }
-    }
-}
-
-template 
-template >
-void ZeGraphExtWrappers::getMetadata(ze_graph_handle_t graphHandle,
-                                     uint32_t index,
-                                     std::vector& inputs,
-                                     std::vector& outputs) const {
-    ze_graph_argument_properties_3_t arg;
-    auto result = _zeroInitStruct->getGraphDdiTable().pfnGetArgumentProperties3(graphHandle, index, &arg);
-    THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetArgumentProperties3", result, _zeroInitStruct->getGraphDdiTable());
+void ZeGraphExtWrappers::getMetadata(ze_graph_handle_t graphHandle,
+                                     uint32_t index,
+                                     std::vector& inputs,
+                                     std::vector& outputs) const {
+    if (NotSupportArgumentMetadata(_graphExtVersion)) {
+        ze_graph_argument_properties_3_t arg;
+        _logger.debug("getMetadata - perform pfnGetArgumentProperties3");
+        auto result = _zeroInitStruct->getGraphDdiTable().pfnGetArgumentProperties3(graphHandle, index, &arg);
+        THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetArgumentProperties3", result, _zeroInitStruct->getGraphDdiTable());
+
+        switch (arg.type) {
+        case ZE_GRAPH_ARGUMENT_TYPE_INPUT: {
+            inputs.push_back(getIODescriptor(arg, std::nullopt));
+        } break;
+        case ZE_GRAPH_ARGUMENT_TYPE_OUTPUT: {
+            outputs.push_back(getIODescriptor(arg, std::nullopt));
+        } break;
+        default: {
+            OPENVINO_THROW("Invalid ze_graph_argument_type_t found in ze_graph_argument_properties_3_t object: ",
+                           arg.type);
+        }
+        }
+    } else {
+        ze_graph_argument_properties_3_t arg;
+        _logger.debug("getMetadata - perform pfnGetArgumentProperties3");
+        auto result = _zeroInitStruct->getGraphDdiTable().pfnGetArgumentProperties3(graphHandle, index, &arg);
+        THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetArgumentProperties3", result, _zeroInitStruct->getGraphDdiTable());

-    std::optional optionalMetadata = std::nullopt;
+        std::optional optionalMetadata = std::nullopt;

-    if (!isStateInputName(arg.name) && !isStateOutputName(arg.name) && !isShapeTensorName(arg.name)) {
-        ze_graph_argument_metadata_t metadata;
-        result = _zeroInitStruct->getGraphDdiTable().pfnGraphGetArgumentMetadata(graphHandle, index, &metadata);
-        THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGraphGetArgumentMetadata", result, _zeroInitStruct->getGraphDdiTable());
+        if (!isStateInputName(arg.name) && !isStateOutputName(arg.name) && !isShapeTensorName(arg.name)) {
+            _logger.debug("getMetadata - perform pfnGetArgumentMetadata");
+            ze_graph_argument_metadata_t metadata;
+            result = _zeroInitStruct->getGraphDdiTable().pfnGraphGetArgumentMetadata(graphHandle, index, &metadata);
+            THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGraphGetArgumentMetadata", result, _zeroInitStruct->getGraphDdiTable());

-        optionalMetadata = std::optional(metadata);
-    }
+            optionalMetadata = std::optional(metadata);
+        }

-    switch (arg.type) {
-    case ZE_GRAPH_ARGUMENT_TYPE_INPUT: {
-        inputs.push_back(getIODescriptor(arg, optionalMetadata));
-    } break;
-    case ZE_GRAPH_ARGUMENT_TYPE_OUTPUT: {
-        outputs.push_back(getIODescriptor(arg, optionalMetadata));
-    } break;
-    default: {
-        OPENVINO_THROW("Invalid ze_graph_argument_type_t found in ze_graph_argument_properties_3_t object: ", arg.type);
-    }
+        switch (arg.type) {
+        case ZE_GRAPH_ARGUMENT_TYPE_INPUT: {
+            inputs.push_back(getIODescriptor(arg, optionalMetadata));
+        } break;
+        case ZE_GRAPH_ARGUMENT_TYPE_OUTPUT: {
+            outputs.push_back(getIODescriptor(arg, optionalMetadata));
+        } break;
+        default: {
+            OPENVINO_THROW("Invalid ze_graph_argument_type_t found in ze_graph_argument_properties_3_t object: ",
+                           arg.type);
+        }
+        }
     }
 }

-template 
-NetworkMetadata ZeGraphExtWrappers::getNetworkMeta(ze_graph_handle_t graphHandle) const {
+NetworkMetadata ZeGraphExtWrappers::getNetworkMeta(ze_graph_handle_t graphHandle) const {
     ze_graph_properties_t graphProperties{};

+    _logger.debug("getNetworkMeta - perform pfnGetProperties");
     auto result = _zeroInitStruct->getGraphDdiTable().pfnGetProperties(graphHandle, &graphProperties);
     THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetProperties", result, _zeroInitStruct->getGraphDdiTable());
-
     NetworkMetadata meta;
-
     for (uint32_t index = 0; index < graphProperties.numGraphArgs; ++index) {
         getMetadata(graphHandle, index, meta.inputs, meta.outputs);
     }
     // TODO: support this information in CiD [track: E#33479]
     meta.numStreams = 1;
     meta.bindRelatedDescriptors();
-
     return meta;
 }

-template class ZeGraphExtWrappers;
-template class ZeGraphExtWrappers;
-template class ZeGraphExtWrappers;
-template class ZeGraphExtWrappers;
-template class ZeGraphExtWrappers;
-template class ZeGraphExtWrappers;
-template class ZeGraphExtWrappers;
-
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/thirdparty/level-zero-ext b/src/plugins/intel_npu/thirdparty/level-zero-ext
index a6487cc2c5da9a..a63155ae4e64fe 160000
--- a/src/plugins/intel_npu/thirdparty/level-zero-ext
+++ b/src/plugins/intel_npu/thirdparty/level-zero-ext
@@ -1 +1 @@
-Subproject commit a6487cc2c5da9aa13db9e005a320a1b6a0ee5919
+Subproject commit a63155ae4e64feaaa6931f4696c2e2e699063875

From 037689031c7866be452d30be360205ee4331745f Mon Sep 17 00:00:00 2001
From: Sebastian Golebiewski 
Date: Fri, 15 Nov 2024 16:57:32 +0100
Subject: [PATCH 02/53] [DOCS] Updating Weight Compression Article (#27432)

Rearranging information in the `LLM Weight Compression` article.

---------

Co-authored-by: Karol Blaszczak 
---
 .../weight-compression.rst                    | 669 +++++++++---------
 .../4-bit-weight-quantization.rst             | 175 +++++
 2 files changed, 492 insertions(+), 352 deletions(-)
 create mode 100644 docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst

diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst
index bbc09ccd4b5fbb..046dde9661c3bb 100644
--- a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst
+++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst
@@ -6,38 +6,36 @@ LLM Weight Compression
    :hidden:

    weight-compression/microscaling-quantization
+   weight-compression/4-bit-weight-quantization


-Weight compression is a technique for enhancing the efficiency of models,
-especially those with large memory requirements. This method reduces the model's
-memory footprint, a crucial factor for Large Language Models (LLMs).
+Weight compression enhances the efficiency of models by reducing their memory footprint,
+a crucial factor for Large Language Models (LLMs). It is especially effective for
+networks with high memory requirements.

-Unlike full model quantization, where weights and activations are quantized,
-weight compression in `Neural Network Compression Framework (NNCF) `__
-only targets the model's weights. This approach allows the activations to remain as
-floating-point numbers, preserving most of the model's accuracy while improving its
-speed and reducing its size.
+Unlike full model quantization, where both weights and activations are quantized, it
+only targets weights, keeping activations as floating-point numbers. This means preserving most
+of the model's accuracy while improving its
+speed and reducing its size. The reduction in size is especially noticeable with larger models.
+For instance, the 7 billion parameter Llama 2 model can be reduced
+from about 25GB to 4GB using 4-bit weight compression.

-The reduction in size is especially noticeable with larger models,
-for instance the 7 billion parameter Llama 2 model can be reduced
-from about 25GB to 4GB using 4-bit weight compression. With smaller models (i.e. less
-than 1B parameters), weight compression may result in more accuracy reduction than
-with larger models.
+.. note::
+
+   With smaller language models (i.e., less than 1B parameters), weight
+   compression may result in more accuracy reduction than with larger models.
+   Therefore, weight compression is recommended for use with LLMs only.

-LLMs and other models that require
+LLMs and other GenAI models that require
 extensive memory to store the weights during inference can benefit from weight compression as it:

 * enables inference of exceptionally large models that cannot be accommodated in the
   device memory;
-
 * reduces storage and memory overhead, making models more lightweight and less resource
   intensive for deployment;
-
 * improves inference speed by reducing the latency of memory access when computing the
   operations with weights, for example, Linear layers. The weights are smaller and thus faster to load
   from memory;
-
 * unlike quantization, does not require sample data to calibrate the range of
   activation values.

@@ -46,197 +44,228 @@ provides weight quantization to 8 and 4-bit integer data types as a compression
 method primarily designed to optimize LLMs.

+Compression Methods (8-bit vs. 4-bit)
+#####################################
+
+For models that come from `Hugging Face `__ and are supported
+by Optimum, it is recommended to use the **Optimum Intel API**, which employs NNCF weight
+compression capabilities to optimize various large Transformer models.
+
+The NNCF ``nncf.compress_weights()`` API, with most of its options, is exposed in the
+``.from_pretrained()`` method of Optimum Intel classes. Optimum also has several datasets
+for data-aware quantization available out-of-the-box.

-Compress Model Weights
-######################
+You can use the examples below to perform data-free 8-bit or 4-bit weight quantization.
+Before you start, make sure Optimum Intel is installed in your environment
+by running the following command:

-**8-bit weight quantization** method offers a balance between model size reduction and
-maintaining accuracy, which usually leads to significant performance improvements for
-Transformer-based models. Models with 8-bit compressed weights are performant on the
-vast majority of supported CPU and GPU platforms. By default, weights are compressed
-asymmetrically to "INT8_ASYM" mode.
+.. code-block:: console
+
+   pip install optimum[openvino]

-The code snippet below shows how to do asymmetrical 8-bit quantization of the model weights
-represented in OpenVINO IR using NNCF:
+**8-bit weight quantization** offers a good balance between reducing the size and lowering the
+accuracy of a model. It usually results in significant improvements for transformer-based models
+and guarantees good model performance for a vast majority of supported CPU and GPU platforms.
+By default, weights are compressed asymmetrically to "INT8_ASYM" mode.

 .. tab-set::

-   .. tab-item:: OpenVINO
-      :sync: openvino
+   .. tab-item:: Compression with Optimum-Intel
+      :sync: optimum

-      .. doxygensnippet:: docs/optimization_guide/nncf/code/weight_compression_openvino.py
-         :language: python
-         :fragment: [compression_8bit]
+      Load a pre-trained Hugging Face model, compress it to INT8_ASYM, using the
+      Optimum Intel API, and then execute inference with a text phrase:

+      Simply use the optimum-cli command line tool:

+      .. code-block:: console

+         optimum-cli export openvino --model microsoft/Phi-3.5-mini-instruct --weight-format int8 ov_phi-3.5-mini-instruct

+      You can also use the code sample to the same effect:

+      .. code-block:: python

+         from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig
+         from transformers import AutoTokenizer, pipeline

+         # Load and compress a model from Hugging Face.
+         model_id = "microsoft/Phi-3.5-mini-instruct"
+         model = OVModelForCausalLM.from_pretrained(
+             model_id,
+             export=True,
+             quantization_config=OVWeightQuantizationConfig(bits=8)
+         )

+         # Inference
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+         phrase = "The weather is"
+         results = pipe(phrase)
+         print(results)

-Now, the model is ready for compilation and inference.
-It can be also saved into a compressed format, resulting in a smaller binary file.
+      For more details, refer to the article on how to
+      :doc:`infer LLMs using Optimum Intel <../../learn-openvino/llm_inference_guide/llm-inference-hf>`.

-**4-bit weight quantization** method stands for an INT4-INT8 mixed-precision weight quantization,
-where INT4 is considered as the primary precision and asymmetric INT8 is the backup one.
-It usually results in a smaller model size and lower inference latency, although the accuracy
-degradation could be higher, depending on the model.
+   .. tab-item:: Compression with NNCF
+      :sync: nncf

-The code snippet below shows how to do 4-bit quantization of the model weights represented
-in OpenVINO IR using NNCF:
+      Load a pre-trained Hugging Face model, using the Optimum Intel API,
+      compress it to INT8_ASYM, using NNCF, and then execute inference with a text phrase:

-.. tab-set::
+      .. code-block:: python

+         from nncf import compress_weights, CompressWeightsMode
+         from optimum.intel.openvino import OVModelForCausalLM
+         from transformers import AutoTokenizer, pipeline

-The INT4 method has several parameters that can provide different performance-accuracy
-trade-offs after optimization:
+         # Load a model and compress it with NNCF.
+         model_id = "microsoft/Phi-3.5-mini-instruct"
+         model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_8bit=False, compile=False)
+         model.model = compress_weights(model.model, mode=CompressWeightsMode.INT8_ASYM)

-* ``mode`` - there are two optimization modes: symmetric and asymmetric.
+         # Inference
+         model.compile()
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+         phrase = "The weather is"
+         results = pipe(phrase)
+         print(results)

-   **Symmetric Compression** - ``INT4_SYM``
-   INT4 Symmetric mode involves quantizing weights to a signed 4-bit integer
-   symmetrically without zero point. This mode is faster than the INT8_ASYM, making
-   it ideal for situations where **speed and size reduction are prioritized over accuracy**.
+Here is an example of code using NNCF to perform asymmetrical 8-bit weight quantization of
+a model in the OpenVINO IR format:

-   .. code-block:: python
+.. tab-set::

-      from nncf import compress_weights
-      from nncf import CompressWeightsMode
+   .. tab-item:: OpenVINO
+      :sync: openvino

-      compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_SYM)
+      .. doxygensnippet:: docs/optimization_guide/nncf/code/weight_compression_openvino.py
+         :language: python
+         :fragment: [compression_8bit]

-   **Asymmetric Compression** - ``INT4_ASYM``
-   INT4 Asymmetric mode also uses an unsigned 4-bit integer but quantizes weights
-   asymmetrically with a non-fixed zero point. This mode slightly compromises speed in
-   favor of better accuracy compared to the symmetric mode. This mode is useful when
-   **minimal accuracy loss is crucial**, but a faster performance than INT8 is still desired.
+**4-bit weight quantization** is actually a mixed-precision compression,
+primarily INT4 with asymmetric INT8 as the backup precision. It produces a smaller model,
+offering lower inference latency but potentially noticeable accuracy degradation,
+depending on the model.

-   .. code-block:: python
+.. tab-set::

-      from nncf import compress_weights
-      from nncf import CompressWeightsMode
+   .. tab-item:: Compression with Optimum-Intel
+      :sync: optimum

-      compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_ASYM)
+      Load a pre-trained Hugging Face model, compress it to INT4, using the
+      Optimum Intel API, and then execute inference with a text phrase:

-* ``group_size`` controls the size of the group of weights that share the same
-  quantization parameters. Shared quantization parameters help to speed up the
-  calculation of activation values as they are dequantized and quantized between
-  layers. However, they can reduce accuracy. The following group sizes are
-  recommended: ``128``, ``64``, ``32`` (``128`` is default value).
+      Simply use the optimum-cli command line tool:

-  `Smaller Group Size`: Leads to a more accurate model but increases the model's
-  footprint and reduces inference speed.
+      .. code-block:: console

-  `Larger Group Size`: Results in faster inference and a smaller model, but might
-  compromise accuracy.
+ optimum-cli export openvino --model microsoft/Phi-3.5-mini-instruct --weight-format int4 --awq --scale-estimation --dataset wikitext2 --group-size 64 --ratio 1.0 ov_phi-3.5-mini-instruct -* ``ratio`` controls the ratio between the layers compressed to the precision defined - by ``mode`` and the rest of the layers that will be kept in the ``backup_mode`` in the optimized model. - Ratio is a decimal between 0 and 1. For example, 0.8 means that 80% of layers will be - compressed to the precision defined by ``mode``, while the rest will be compressed to - ``backup_mode`` precision. The default value for ratio is 1. + You can also use the code sample to the same effect: - `Higher Ratio (more layers set to mode precision)`: Reduces the model size and increase inference speed but - might lead to higher accuracy degradation. + .. code-block:: python - `Lower Ratio (more layers set to backup_mode precision)`: Maintains better accuracy but results in a larger model size - and potentially slower inference. + from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig + from transformers import AutoTokenizer, pipeline - In this example, 90% of the model's layers are quantized to INT4 asymmetrically with - a group size of 64: + # Load and compress a model from Hugging Face. + model_id = "microsoft/Phi-3.5-mini-instruct" + model = OVModelForCausalLM.from_pretrained( + model_id, + export=True, + quantization_config=OVWeightQuantizationConfig( + bits=4, + quant_method="awq", + scale_estimation=True, + dataset="wikitext2", + group_size=64, + ratio=1.0 + ) + ) - .. code-block:: python + # Inference + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + phrase = "The weather is" + results = pipe(phrase) + print(results) - from nncf import compress_weights, CompressWeightsMode + .. tab-item:: Compression with NNCF + :sync: nncf - # Example: Compressing weights with INT4_ASYM mode, group size of 64, and 90% INT4 ratio - compressed_model = compress_weights( - model, - mode=CompressWeightsMode.INT4_ASYM, - group_size=64, - ratio=0.9, - ) + Load a pre-trained Hugging Face model, using the Optimum Intel API, + compress it to INT4 using NNCF, and then execute inference with a text phrase: -* ``scale_estimation`` - boolean parameter that enables more accurate estimation of - quantization scales. Especially helpful when the weights of all layers are quantized to - 4 bits. Requires dataset. + .. code-block:: python -* ``awq`` - boolean parameter that enables the AWQ method for more accurate INT4 weight - quantization. Especially helpful when the weights of all the layers are quantized to - 4 bits. The method can sometimes result in reduced accuracy when used with - Dynamic Quantization of activations. Requires dataset. + from nncf import compress_weights, CompressWeightsMode + from optimum.intel.openvino import OVModelForCausalLM + from transformers import AutoTokenizer, pipeline -* ``gptq`` - boolean parameter that enables the GPTQ method for more accurate INT4 weight - quantization. Requires dataset. + # Load a model and compress it with NNCF. + model_id = "microsoft/Phi-3.5-mini-instruct" + model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_8bit=False, compile=False) + model.model = compress_weights(model.model, mode=CompressWeightsMode.INT4_SYM) -* ``dataset`` - calibration dataset for data-aware weight compression. 
It is required - for some compression options, for example, ``scale_estimation``, ``gptq`` or ``awq``. Some types - of ``sensitivity_metric`` can use data for precision selection. + # Inference + model.compile() + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + phrase = "The weather is" + results = pipe(phrase) + print(results) -* ``sensitivity_metric`` - controls the metric to estimate the sensitivity of compressing - layers in the bit-width selection algorithm. Some of the metrics require dataset to be - provided. The following types are supported: - * ``nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR`` - data-free metric computed as - the inverted 8-bit quantization noise. Weights with highest value of this metric can - be accurately quantized channel-wise to 8-bit. The idea is to leave these weights in - 8 bit, and quantize the rest of layers to 4-bit group-wise. Since group-wise is more - accurate than per-channel, accuracy should not degrade. + For more details, refer to the article on how to + :doc:`infer LLMs using Optimum Intel <../../../learn-openvino/llm_inference_guide/llm-inference-hf>`. - * ``nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION`` - requires dataset. The average - Hessian trace of weights with respect to the layer-wise quantization error multiplied - by L2 norm of 8-bit quantization noise. +The code snippet below shows how to do 4-bit quantization of the model weights represented +in OpenVINO IR using NNCF: - * ``nncf.SensitivityMetric.MEAN_ACTIVATION_VARIANCE`` - requires dataset. The mean - variance of the layers' inputs multiplied by inverted 8-bit quantization noise. +.. tab-set:: - * ``nncf.SensitivityMetric.MAX_ACTIVATION_VARIANCE`` - requires dataset. The maximum - variance of the layers' inputs multiplied by inverted 8-bit quantization noise. + .. tab-item:: OpenVINO + :sync: openvino - * ``nncf.SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE`` - requires dataset. The mean - magnitude of the layers' inputs multiplied by inverted 8-bit quantization noise. + .. doxygensnippet:: docs/optimization_guide/nncf/code/weight_compression_openvino.py + :language: python + :fragment: [compression_4bit] + +Refer to the article about +:doc:`4-bit weight quantization <./weight-compression/4-bit-weight-quantization>` +for more details. -* ``all_layers`` - boolean parameter that enables INT4 weight quantization of all - Fully-Connected and Embedding layers, including the first and last layers in the model. +Once the model has been optimized, it is ready for compilation and inference. The model can +also be :ref:`saved into a compressed format `, resulting in a +smaller binary file. + +The table below summarizes the benefits and trade-offs for each compression type in terms of +memory reduction, speed gain, and accuracy loss. -* ``lora_correction`` - boolean parameter that enables the LoRA Correction Algorithm - to further improve the accuracy of INT4 compressed models on top of other - algorithms - AWQ and Scale Estimation. +.. list-table:: + :widths: 25 20 20 20 + :header-rows: 1 -* ``backup_mode`` - defines a backup precision for mixed-precision weight compression. - There are three modes: INT8_ASYM, INT8_SYM, and NONE, which retains - the original floating-point precision of the model weights (``INT8_ASYM`` is default value). 
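+As an illustrative sketch only (assuming an ``openvino.Model`` named ``model`` and
+an NNCF version that exposes ``nncf.BackupMode``; both are assumptions, not part of
+the original snippet), ``ratio`` and ``backup_mode`` can be combined like this:
+
+.. code-block:: python
+
+   from nncf import BackupMode, CompressWeightsMode, compress_weights
+
+   # Hypothetical setup: 80% of the layers go to symmetric INT4; the remaining
+   # layers fall back to symmetric INT8 instead of the default INT8_ASYM.
+   compressed_model = compress_weights(
+       model,
+       mode=CompressWeightsMode.INT4_SYM,
+       ratio=0.8,
+       backup_mode=BackupMode.INT8_SYM,
+   )
+
+The same call accepts the other backup values listed above (``INT8_ASYM`` and ``NONE``).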
+ * - + - Memory Reduction + - Latency Improvement + - Accuracy Loss + * - INT8 Asymmetric + - Low + - Medium + - Low + * - INT4 Symmetric + - High + - High + - High + * - INT4 Asymmetric + - High + - Medium + - Medium **Use synthetic data for LLM weight compression** @@ -268,8 +297,8 @@ for details of the usage. # Synthetic-based compression synthetic_dataset = nncf.data.generate_text_data(hf_model, tokenizer, dataset_size=100) quantization_dataset = nncf.Dataset( - synthetic_dataset, - transform_fn # see example in NNCF repo how to make transform_fn + synthetic_dataset, + transform_fn # See the example in NNCF repo to learn how to make transform_fn. ) model = compress_weights( @@ -280,58 +309,16 @@ for details of the usage. dataset=quantization_dataset, awq=True, scale_estimation=True - ) # model is openvino.Model + ) # The model is openvino.Model. For data-aware weight compression refer to the following `example `__. .. note:: - Some methods can be stacked on top of one another to achieve a better - accuracy-performance trade-off after weight quantization. For example, the **Scale Estimation** - method can be applied along with **AWQ** and mixed-precision quantization (the ``ratio`` parameter). - - -**Hugging Face Optimum-Intel API** - -Hugging Face Optimum-Intel provides an easy way to use NNCF Weight Compression capabilities to optimize -various large Transformer models. Most of the options of the NNCF ``nncf.compress_weights()`` API are -exposed in the ``.from_pretrained()`` method of Optimum-Intel classes. Optimum also has several datasets -for data-aware quantization available out-of-the-box. -The example below shows data-free 4-bit weight quantization -applied on top of OpenVINO IR. Before trying the example, make sure Optimum Intel -is installed in your environment by running the following command: - -.. code-block:: python - - pip install optimum[openvino] - -.. code-block:: python - - from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig - from transformers import AutoTokenizer, pipeline - - # Load and compress model from Hugging Face - model_id = "microsoft/Phi-3.5-mini-instruct" - model = OVModelForCausalLM.from_pretrained( - model_id, - export=True, - quantization_config=OVWeightQuantizationConfig( - bits=4, - quant_method="awq", - scale_estimation=True, - dataset="wikitext2", - group_size=64, - ratio=1.0 - ) - ) - - # Inference - tokenizer = AutoTokenizer.from_pretrained(model_id) - pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) - phrase = "The weather is" - results = pipe(phrase) - print(results) + Some methods can be stacked on top of one another to achieve a better + accuracy-performance trade-off after weight quantization. For example, the **Scale Estimation** + method can be applied along with **AWQ** and mixed-precision quantization (the ``ratio`` parameter). Exporting and Loading Compressed Models @@ -344,179 +331,157 @@ so it is preferable to compress the model once, save it, and then load the compressed model later for faster time to first inference. .. 
code-block:: python
+   :name: save_pretrained

-    # Save compressed model for faster loading later
-    model.save_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")
-    tokenizer.save_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")
-
-    # Load a saved model
-    model = OVModelForCausalLM.from_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")
-    tokenizer = AutoTokenizer.from_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")
-
-GPTQ Models
-############
+   # Save compressed model for faster loading later
+   model.save_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")
+   tokenizer.save_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")

-OpenVINO also supports 4-bit models from Hugging Face
-`Transformers `__ library optimized
-with `GPTQ `__. In this case, there is no
-need for an additional model optimization step because model conversion will
-automatically preserve the INT4 optimization results, allowing model inference to benefit from it.
+   # Load a saved model
+   model = OVModelForCausalLM.from_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")
+   tokenizer = AutoTokenizer.from_pretrained("Phi-3.5-mini-instruct-int4-sym-ov")

-A compression example using a GPTQ model is shown below.
-Make sure to install GPTQ dependencies by running the following command:
+.. tip::

-.. code-block:: python
-
-   pip install optimum[openvino] auto-gptq
-
-.. code-block:: python
+   Models optimized with NNCF or Optimum Intel can be used with
+   :doc:`OpenVINO GenAI <../../learn-openvino/llm_inference_guide/genai-guide>`.

-   from optimum.intel.openvino import OVModelForCausalLM
-   from transformers import AutoTokenizer, pipeline

-   # Load model from Hugging Face already optimized with GPTQ
-   model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
-   model = OVModelForCausalLM.from_pretrained(model_id, export=True)
+Auto-tuning of Weight Compression Parameters
+############################################

-   # Inference
-   tokenizer = AutoTokenizer.from_pretrained(model_id)
-   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-   phrase = "The weather is"
-   results = pipe(phrase)
-   print(results)
+To find the optimal weight compression parameters for a particular model, refer to the
+`example `__,
+where weight compression parameters are searched over a subset of values.
+To speed up the search, a custom-built validation pipeline called
+`WhoWhatBench `__
+is used. The pipeline can quickly evaluate the changes in the accuracy of the optimized
+model compared to the baseline.

-An `example of a model `__
-that has been optimized using GPTQ.

 Compression Metrics Examples
-########################################
+############################

-The table below shows examples of text-generation Language Models with different
+Below you will find examples of text-generation Language Models with different
 optimization settings in a data-free setup, where no dataset is used at the
 optimization step. The Perplexity metric is a measurement of response accuracy,
 where a higher perplexity score indicates a lower accuracy. It is measured on the
 `Lambada OpenAI dataset `__.

-..
list-table:: - :widths: 40 55 25 25 - :header-rows: 1 - - * - Model - - Optimization - - Perplexity\* - - Model Size (Gb) - * - databricks/dolly-v2-3b - - FP32 - - 5.01 - - 10.3 - * - databricks/dolly-v2-3b - - INT8_ASYM - - 5.07 - - 2.6 - * - databricks/dolly-v2-3b - - INT4_ASYM,group_size=32,ratio=0.5 - - 5.28 - - 2.2 - * - facebook/opt-6.7b - - FP32 - - 4.25 - - 24.8 - * - facebook/opt-6.7b - - INT8_ASYM - - 4.27 - - 6.2 - * - facebook/opt-6.7b - - INT4_ASYM,group_size=64,ratio=0.8 - - 4.32 - - 4.1 - * - meta-llama/Llama-2-7b-chat-hf - - FP32 - - 3.28 - - 25.1 - * - meta-llama/Llama-2-7b-chat-hf - - INT8_ASYM - - 3.29 - - 6.3 - * - meta-llama/Llama-2-7b-chat-hf - - INT4_ASYM,group_size=128,ratio=0.8 - - 3.41 - - 4.0 - * - togethercomputer/RedPajama-INCITE-7B-Instruct - - FP32 - - 4.15 - - 25.6 - * - togethercomputer/RedPajama-INCITE-7B-Instruct - - INT8_ASYM - - 4.17 - - 6.4 - * - togethercomputer/RedPajama-INCITE-7B-Instruct - - INT4_ASYM,group_size=128,ratio=1.0 - - 4.17 - - 3.6 - * - meta-llama/Llama-2-13b-chat-hf - - FP32 - - 2.92 - - 48.5 - * - meta-llama/Llama-2-13b-chat-hf - - INT8_ASYM - - 2.91 - - 12.1 - * - meta-llama/Llama-2-13b-chat-hf - - INT4_SYM,group_size=64,ratio=0.8 - - 2.98 - - 8.0 - - -The following table shows accuracy metric in a data-aware 4-bit weight quantization -setup measured on the `Wikitext dataset `__. - -.. list-table:: - :widths: 40 55 25 25 - :header-rows: 1 - - * - Model - - Optimization - - Word perplexity\* - - Model Size (Gb) - * - meta-llama/llama-7b-chat-hf - - FP32 - - 11.57 - - 12.61 - * - meta-llama/llama-7b-chat-hf - - INT4_SYM,group_size=128,ratio=1.0,awq=True - - 12.34 - - 2.6 - * - stabilityai_stablelm-3b-4e1t - - FP32 - - 10.17 - - 10.41 - * - stabilityai_stablelm-3b-4e1t - - INT4_SYM,group_size=64,ratio=1.0,awq=True - - 10.89 - - 2.6 - * - HuggingFaceH4/zephyr-7b-beta - - FP32 - - 9.82 - - 13.99 - * - HuggingFaceH4/zephyr-7b-beta - - INT4_SYM,group_size=128,ratio=1.0 - - 10.32 - - 2.6 +.. dropdown:: Perplexity\* in data-free optimization + + .. list-table:: + :widths: 40 55 25 25 + :header-rows: 1 + + * - Model + - Optimization + - Perplexity\* + - Model Size (Gb) + * - databricks/dolly-v2-3b + - FP32 + - 5.01 + - 10.3 + * - databricks/dolly-v2-3b + - INT8_ASYM + - 5.07 + - 2.6 + * - databricks/dolly-v2-3b + - INT4_ASYM,group_size=32,ratio=0.5 + - 5.28 + - 2.2 + * - facebook/opt-6.7b + - FP32 + - 4.25 + - 24.8 + * - facebook/opt-6.7b + - INT8_ASYM + - 4.27 + - 6.2 + * - facebook/opt-6.7b + - INT4_ASYM,group_size=64,ratio=0.8 + - 4.32 + - 4.1 + * - meta-llama/Llama-2-7b-chat-hf + - FP32 + - 3.28 + - 25.1 + * - meta-llama/Llama-2-7b-chat-hf + - INT8_ASYM + - 3.29 + - 6.3 + * - meta-llama/Llama-2-7b-chat-hf + - INT4_ASYM,group_size=128,ratio=0.8 + - 3.41 + - 4.0 + * - togethercomputer/RedPajama-INCITE-7B-Instruct + - FP32 + - 4.15 + - 25.6 + * - togethercomputer/RedPajama-INCITE-7B-Instruct + - INT8_ASYM + - 4.17 + - 6.4 + * - togethercomputer/RedPajama-INCITE-7B-Instruct + - INT4_ASYM,group_size=128,ratio=1.0 + - 4.17 + - 3.6 + * - meta-llama/Llama-2-13b-chat-hf + - FP32 + - 2.92 + - 48.5 + * - meta-llama/Llama-2-13b-chat-hf + - INT8_ASYM + - 2.91 + - 12.1 + * - meta-llama/Llama-2-13b-chat-hf + - INT4_SYM,group_size=64,ratio=0.8 + - 2.98 + - 8.0 + + +.. dropdown:: Perplexity\* in data-aware optimization + + The following table shows accuracy metric in a data-aware 4-bit weight quantization + setup measured on the `Wikitext dataset `__. + + .. 
list-table:: + :widths: 40 55 25 25 + :header-rows: 1 + + * - Model + - Optimization + - Word perplexity\* + - Model Size (Gb) + * - meta-llama/llama-7b-chat-hf + - FP32 + - 11.57 + - 12.61 + * - meta-llama/llama-7b-chat-hf + - INT4_SYM,group_size=128,ratio=1.0,awq=True + - 12.34 + - 2.6 + * - stabilityai_stablelm-3b-4e1t + - FP32 + - 10.17 + - 10.41 + * - stabilityai_stablelm-3b-4e1t + - INT4_SYM,group_size=64,ratio=1.0,awq=True + - 10.89 + - 2.6 + * - HuggingFaceH4/zephyr-7b-beta + - FP32 + - 9.82 + - 13.99 + * - HuggingFaceH4/zephyr-7b-beta + - INT4_SYM,group_size=128,ratio=1.0 + - 10.32 + - 2.6 \*Perplexity metric in both tables was measured without the Dynamic Quantization feature enabled in the OpenVINO runtime. -Auto-tuning of Weight Compression Parameters -############################################ - -To find the optimal weight compression parameters for a particular model, refer to the -`example `__ , -where weight compression parameters are being searched from the subset of values. -To speed up the search, a self-designed validation pipeline called -`WhoWhatBench `__ -is used. The pipeline can quickly evaluate the changes in the accuracy of the optimized -model compared to the baseline. Additional Resources #################### diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst new file mode 100644 index 00000000000000..ae9bc7d7b8b4a3 --- /dev/null +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst @@ -0,0 +1,175 @@ +4-bit Weight Quantization +========================= + +The 4-bit weight quantization method results in significant reduction in model size and +memory usage, making LLMs more accessible to less performant devices. +It also usually offers lower inference latency, however, depending on specific models, +it may potentially impact the accuracy. + +Nevertheless, the INT4 method has several parameters that can provide different performance-accuracy +trade-offs after optimization: + +* ``mode`` - there are two optimization modes: symmetric and asymmetric. + + .. tab-set:: + + .. tab-item:: Symmetric Compression + :sync: int4-sym + + INT4 Symmetric mode (``INT4_SYM``) involves quantizing weights to a signed 4-bit integer + symmetrically without zero point. This mode is faster than the INT8_ASYM, making + it ideal for situations where **speed and size reduction are prioritized over accuracy**. + + .. code-block:: python + + from nncf import compress_weights + from nncf import CompressWeightsMode + + compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_SYM) + + .. tab-item:: Asymmetric Compression + :sync: int4-asym + + INT4 Asymmetric mode (``INT4_ASYM``) also uses an unsigned 4-bit integer but quantizes weights + asymmetrically with a non-fixed zero point. This mode slightly compromises speed in + favor of better accuracy compared to the symmetric mode. This mode is useful when + **minimal accuracy loss is crucial**, but a faster performance than INT8 is still desired. + + .. code-block:: python + + from nncf import compress_weights + from nncf import CompressWeightsMode + + compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_ASYM) + +* ``group_size`` controls the size of the group of weights that share the same + quantization parameters. 
Shared quantization parameters help to speed up the + calculation of activation values as they are dequantized and quantized between + layers. However, they can reduce accuracy. The following group sizes are + recommended: ``128``, ``64``, ``32`` (``128`` is default value). + + `Smaller Group Size`: Leads to a more accurate model but increases the model's + footprint and reduces inference speed. + + `Larger Group Size`: Results in faster inference and a smaller model, but might + compromise accuracy. + +* ``ratio`` controls the ratio between the layers compressed to the precision defined + by ``mode`` and the rest of the layers that will be kept in the ``backup_mode`` in the optimized model. + Ratio is a decimal between 0 and 1. For example, 0.8 means that 80% of layers will be + compressed to the precision defined by ``mode``, while the rest will be compressed to + ``backup_mode`` precision. The default value for ratio is 1. + + | **Higher Ratio (more layers set to mode precision)**: + | Reduces the model size and increase inference speed but + might lead to higher accuracy degradation. + + | **Lower Ratio (more layers set to backup_mode precision)**: + | Maintains better accuracy but results in a larger model size + and potentially slower inference. + + In the example below, 90% of the model's layers are quantized to INT4 asymmetrically with + a group size of 64: + + .. code-block:: python + + from nncf import compress_weights, CompressWeightsMode + + # Example: Compressing weights with INT4_ASYM mode, group size of 64, and 90% INT4 ratio + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.INT4_ASYM, + group_size=64, + ratio=0.9, + ) + +* ``scale_estimation`` - a boolean parameter that enables more accurate estimation of + quantization scales. Especially helpful when the weights of all layers are quantized to + 4 bits. Requires dataset. + +* ``awq`` - a boolean parameter that enables the AWQ method for more accurate INT4 weight + quantization. Especially helpful when the weights of all the layers are quantized to + 4 bits. The method can sometimes result in reduced accuracy when used with + Dynamic Quantization of activations. Requires dataset. + +* ``gptq`` - a boolean parameter that enables the GPTQ method for more accurate INT4 weight + quantization. Requires dataset. + +* ``dataset`` - a calibration dataset for data-aware weight compression. It is required + for some compression options, for example, ``scale_estimation``, ``gptq`` or ``awq``. Some types + of ``sensitivity_metric`` can use data for precision selection. + +* ``sensitivity_metric`` - controls the metric to estimate the sensitivity of compressing + layers in the bit-width selection algorithm. Some of the metrics require dataset to be + provided. The following types are supported: + + * ``nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR`` - a data-free metric computed as + the inverted 8-bit quantization noise. Weights with highest value of this metric can + be accurately quantized channel-wise to 8-bit. The idea is to leave these weights in + 8 bit, and quantize the rest of layers to 4-bit group-wise. Since group-wise is more + accurate than per-channel, accuracy should not degrade. + + * ``nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION`` - requires a dataset. The average + Hessian trace of weights with respect to the layer-wise quantization error multiplied + by L2 norm of 8-bit quantization noise. + + * ``nncf.SensitivityMetric.MEAN_ACTIVATION_VARIANCE`` - requires a dataset. 
The mean + variance of the layers' inputs multiplied by inverted 8-bit quantization noise. + + * ``nncf.SensitivityMetric.MAX_ACTIVATION_VARIANCE`` - requires a dataset. The maximum + variance of the layers' inputs multiplied by inverted 8-bit quantization noise. + + * ``nncf.SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE`` - requires a dataset. The mean + magnitude of the layers' inputs multiplied by inverted 8-bit quantization noise. + +* ``all_layers`` - a boolean parameter that enables INT4 weight quantization of all + Fully-Connected and Embedding layers, including the first and last layers in the model. + +* ``lora_correction`` - a boolean parameter that enables the LoRA Correction Algorithm + to further improve the accuracy of INT4 compressed models on top of other + algorithms - AWQ and Scale Estimation. + +* ``backup_mode`` - defines a backup precision for mixed-precision weight compression. + There are three modes: INT8_ASYM, INT8_SYM, and NONE, which retains + the original floating-point precision of the model weights (``INT8_ASYM`` is default value). + +| + +4-bit Weight Quantization with GPTQ +################################### + +You can use models from Hugging Face +`Transformers `__ library, which are quantized +with `GPTQ `__ algorithm. Such models do not require +additional optimization step because the conversion will automatically preserve +the INT4 optimization results, and model inference will eventually benefit from it. + +See the `example of a model `__ +that has been optimized with GPTQ. + +You can also refer to the code sample below which shows how to load a 4-bit +GPTQ model and run inference. + +.. dropdown:: Using a GPTQ model. + + Make sure to install GPTQ dependencies by running the following command: + + .. code-block:: python + + pip install optimum[openvino] auto-gptq + + .. code-block:: python + + from optimum.intel.openvino import OVModelForCausalLM + from transformers import AutoTokenizer, pipeline + + # Load model from Hugging Face already optimized with GPTQ + model_id = "TheBloke/Llama-2-7B-Chat-GPTQ" + model = OVModelForCausalLM.from_pretrained(model_id, export=True) + + # Inference + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + phrase = "The weather is" + results = pipe(phrase) + print(results) From c4d6d2b37db873e7c657a6049f858cf1cf472cb8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Nov 2024 14:55:20 +0100 Subject: [PATCH 03/53] Update scipy requirement from <1.12,>=1.5.4 to >=1.5.4,<1.15 in /tests (#26735) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the requirements on [scipy](https://github.com/scipy/scipy) to permit the latest version.
Release notes

Sourced from scipy's releases.

SciPy 1.14.1 Release Notes

SciPy 1.14.1 adds support for Python 3.13, including binary wheels on PyPI. Apart from that, it is a bug-fix release with no new features compared to 1.14.0.

Authors

  • Name (commits)
  • h-vetinari (1)
  • Evgeni Burovski (1)
  • CJ Carey (2)
  • Lucas Colley (3)
  • Ralf Gommers (3)
  • Melissa Weber Mendonça (1)
  • Andrew Nelson (3)
  • Nick ODell (1)
  • Tyler Reddy (36)
  • Daniel Schmitz (1)
  • Dan Schult (4)
  • Albert Steppi (2)
  • Ewout ter Hoeven (1)
  • Tibor Völcker (2) +
  • Adam Turner (1) +
  • Warren Weckesser (2)
  • ਗਗਨਦੀਪ ਸਿੰਘ (Gagandeep Singh) (1)

A total of 17 people contributed to this release. People with a "+" by their names contributed a patch for the first time. This list of names is automatically generated, and may not be fully complete.

Commits
  • 92d2a85 REL: 1.14.1 rel commit [wheel build]
  • 85623a1 Merge pull request #21362 from tylerjereddy/treddy_1.14.1_backports
  • d924005 MAINT: PR 21362 revisions [wheel build]
  • b901a4e MAINT, CI: PR 21362 revisions [wheel build]
  • 2a7ec60 MAINT, BLD: PR 21362 revisions [wheel build]
  • f4f084d MAINT, CI: PR 21362 revisions [wheel build]
  • b712fc6 DOC: update 1.14.1 relnotes [wheel build]
  • cdd5aca MAINT: special: Accommodate changed integer handling in NumPy 2.0. (#21401)
  • 0f91838 BLD: cp313 wheels on manylinux_aarch64 (#21409)
  • 6dd0b00 MAINT, CI: wheel build changes [wheel build]
  • Additional commits viewable in compare view

You can trigger a rebase of this PR by commenting `@dependabot rebase`.

---
> **Note** > Automatic rebases have been disabled on this pull request as it has been open for over 30 days. Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- tests/e2e_tests/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_tests/requirements.txt b/tests/e2e_tests/requirements.txt index a4716e496470f4..29e1c1cf31c558 100644 --- a/tests/e2e_tests/requirements.txt +++ b/tests/e2e_tests/requirements.txt @@ -5,7 +5,7 @@ # for common utils py-cpuinfo==9.0.0 -scipy>=1.5.4,<1.12 +scipy>=1.5.4,<1.15 opencv-python>=4.5; sys_platform != "darwin" opencv-python==4.8.1.78; sys_platform == "darwin" unittest-xml-reporting==3.0.4 From 92d65dc4282bad2bc3bccb696d6bede07445a9ca Mon Sep 17 00:00:00 2001 From: Wilson Seok Date: Sun, 17 Nov 2024 22:17:17 -0800 Subject: [PATCH 04/53] [GPU] Consider the node which is shape_of subgraph and ocl_impl in update_shape() (#27551) ### Details: - Consider the node which is shape_of subgraph and ocl_impl in update_shape() ### Tickets: - 154135 --- .../intel_gpu/src/graph/primitive_inst.cpp | 7 +- .../dynamic_execution/update_shape_test.cpp | 68 +++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/unit/dynamic_execution/update_shape_test.cpp diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index c92c5854a8199e..e5276ed678b355 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -392,7 +392,9 @@ void primitive_inst::update_shape() { if (i >= _deps.size()) continue; - if (_deps[i].first->get_node().is_in_shape_of_subgraph()) { + if (_deps[i].first->get_node().is_in_shape_of_subgraph() && + (_deps[i].first->get_node().get_selected_impl() ? _deps[i].first->get_node().get_selected_impl()->is_cpu() + : _deps[i].first->get_node().get_preferred_impl_type() == impl_types::cpu)) { bool can_skip = true; const auto& insts = _deps[i].first->dependant_shape_of_insts; for (auto& inst : insts) { @@ -432,7 +434,8 @@ void primitive_inst::update_shape() { continue; } - if (!get_node().is_type() && !dep->get_node().is_in_shape_of_subgraph()) { + if (!get_node().is_type() && + !(dep->get_node().get_selected_impl() ? 
dep->get_node().get_selected_impl()->is_cpu() : dep->get_node().get_preferred_impl_type() == impl_types::cpu)) { has_runtime_deps = true; // Events may be not created for in-order queue, so take them for OOO queue only diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/update_shape_test.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/update_shape_test.cpp new file mode 100644 index 00000000000000..ec5042e853fa7a --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/update_shape_test.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include +#include +#include +#include +#include +#include + +#include "program_wrapper.h" + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +namespace update_shape_tests { +TEST(update_shape_test, ocl_impl_in_shapeof_subgraph) { + auto& engine = get_test_engine(); + + layout const1_gather_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx}; + auto const1_gather = engine.allocate_memory(const1_gather_layout); + set_values(const1_gather, {1}); + + layout const_broadcast_layout = layout{ov::PartialShape{}, data_types::i32, format::bfyx}; + auto const_broadcast = engine.allocate_memory(const_broadcast_layout); + set_values(const_broadcast, {1}); + + layout input_l= layout{ov::PartialShape{1, 128}, data_types::i32, format::bfyx}; + auto input_mem = engine.allocate_memory(input_l); + set_values(input_mem, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 4, 5, 6, 7, 8,}); + + auto input_l_dynamic = layout{ov::PartialShape::dynamic(2), data_types::i32, format::bfyx}; + topology topology(input_layout("input", input_l_dynamic), + data("const1_gather", const1_gather), + data("const_broadcast", const_broadcast), + shape_of("shape_of", input_info("input"), data_types::i32), + gather("gather", input_info("shape_of"), input_info("const1_gather"), 0, 1, ov::Shape({1})), + broadcast("broadcast1", input_info("const_broadcast"), input_info("gather"), {}, ov::op::BroadcastType::NUMPY), + count_nonzero("count_nonzero", input_info("broadcast1")), + gather_nonzero("gather_nonzero", input_info("broadcast1"), input_info("count_nonzero")), + broadcast("broadcast2", input_info("gather_nonzero"), input_info("shape_of"), {}, ov::op::BroadcastType::BIDIRECTIONAL)); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + std::map outputs; + OV_ASSERT_NO_THROW(outputs = network.execute()); +} +} // update_shape_test From 6733cc320915ca6bfad9036940bf5ca244b41a8b Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 18 Nov 2024 11:35:14 +0400 Subject: [PATCH 05/53] [TF FE] Stabilize tf.keras.layers.Conv2DTranspose layer test on all platforms (#27578) **Details:** Stabilize tf.keras.layers.Conv2DTranspose layer test on all platforms **Ticket:** 155121 Signed-off-by: Kazantsev, Roman --- .../test_tf2_keras_conv_2d_transpose.py | 88 ++++++++++--------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git 
a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_conv_2d_transpose.py b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_conv_2d_transpose.py index 1226c373b24fe7..d58aa72d8d0d78 100644 --- a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_conv_2d_transpose.py +++ b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_conv_2d_transpose.py @@ -1,69 +1,75 @@ # Copyright (C) 2022-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import numpy as np import pytest import tensorflow as tf from common.tf2_layer_test_class import CommonTF2LayerTest +rng = np.random.default_rng(233534) + class TestKerasConv2DTranspose(CommonTF2LayerTest): - def create_keras_conv_2d_transpose_net(self, conv_params, input_names, input_shapes, input_type, - ir_version): + def _prepare_input(self, inputs_info): + assert 'x' in inputs_info, "Test error: inputs_info must contain `x`" + x_shape = inputs_info['x'] + inputs_data = {} + inputs_data['x'] = rng.uniform(-2.0, 2.0, x_shape).astype(self.input_type) + return inputs_data + + def create_keras_conv_2d_transpose_net(self, input_shapes, input_type, + filters, kernel_size, + strides, padding, data_format, + dilation_rate, activation, + use_bias): + self.input_type = input_type activation_func_structure = { # pytest-xdist can't execute the tests in parallel because workers can't compare tests scopes before run # tf.nn. operation have no "==" operation to be compared - "relu": tf.nn.relu, - "sigmoid": tf.nn.sigmoid + 'relu': tf.nn.relu, + 'sigmoid': tf.nn.sigmoid } - conv_params = conv_params.copy() - if "activation" in conv_params: - conv_params["activation"] = activation_func_structure[conv_params["activation"]] + activation = activation_func_structure[activation] # create TensorFlow 2 model with Keras Conv2DTranspose operation - tf.keras.backend.clear_session() # For easy reset of notebook state - x = tf.keras.Input(shape=input_shapes[0][1:], dtype=input_type, - name=input_names[0]) # Variable-length sequence of ints + tf.keras.backend.clear_session() + x = tf.keras.Input(shape=input_shapes[0][1:], dtype=input_type, name='x') - y = tf.keras.layers.Conv2DTranspose(**conv_params, input_shape=input_shapes[0][1:])(x) + y = tf.keras.layers.Conv2DTranspose(filters=filters, kernel_size=kernel_size, + strides=strides, padding=padding, data_format=data_format, + dilation_rate=dilation_rate, activation=activation, + use_bias=use_bias)(x) tf2_net = tf.keras.Model(inputs=[x], outputs=[y]) - # TODO: add reference IR net. 
Now it is omitted and tests only inference result that is more important ref_net = None return tf2_net, ref_net - test_data_float32 = [ - dict(conv_params=dict(filters=27, kernel_size=3, padding="valid", strides=(2, 2), - data_format="channels_last"), input_names=["x"], - input_shapes=[[3, 5, 7, 6]], input_type=tf.float32), - dict(conv_params=dict(filters=10, kernel_size=5, padding="same", strides=(7, 7), - activation="relu", use_bias=True, output_padding=(3, 3)), - input_names=["x"], input_shapes=[[3, 5, 7, 8]], input_type=tf.float32), - dict(conv_params=dict(filters=10, kernel_size=5, padding="same", strides=(7, 7), - output_padding=(5, 5)), - input_names=["x"], input_shapes=[[3, 5, 7, 8]], input_type=tf.float32), - dict(conv_params=dict(filters=27, kernel_size=3, padding="valid", dilation_rate=1), - input_names=["x"], - input_shapes=[[3, 9, 7, 6]], input_type=tf.float32), - dict(conv_params=dict(filters=10, kernel_size=5, padding="same", dilation_rate=1), - input_names=["x"], - input_shapes=[[3, 9, 7, 8]], input_type=tf.float32), - dict(conv_params=dict(filters=27, kernel_size=3, padding="valid", dilation_rate=1, - activation="sigmoid", - use_bias=False), input_names=["x"], input_shapes=[[3, 9, 7, 6]], - input_type=tf.float32), - dict(conv_params=dict(filters=10, kernel_size=5, padding="same", dilation_rate=1, - use_bias=True), - input_names=["x"], input_shapes=[[3, 9, 7, 8]], input_type=tf.float32) - ] - - @pytest.mark.parametrize("params", test_data_float32) + @pytest.mark.parametrize('input_shapes', [[[3, 9, 7, 8]]]) + @pytest.mark.parametrize('input_type', [np.float32, np.float64]) + @pytest.mark.parametrize('filters', [2, 5]) + @pytest.mark.parametrize('kernel_size', [3, 5]) + @pytest.mark.parametrize('strides', [(1, 2), (2, 2)]) + @pytest.mark.parametrize('padding', ['valid', 'same']) + @pytest.mark.parametrize('data_format', ['channels_last']) + @pytest.mark.parametrize('dilation_rate', [(1, 1)]) + @pytest.mark.parametrize('activation', ['sigmoid', 'relu']) + @pytest.mark.parametrize('use_bias', [True, False]) @pytest.mark.precommit @pytest.mark.nightly - def test_keras_conv_2d_transpose_float32(self, params, ie_device, precision, ir_version, - temp_dir, use_legacy_frontend): - self._test(*self.create_keras_conv_2d_transpose_net(**params, ir_version=ir_version), + def test_keras_conv_2d_transpose(self, input_shapes, input_type, filters, kernel_size, + strides, padding, data_format, dilation_rate, activation, + use_bias, + ie_device, precision, + ir_version, temp_dir, use_legacy_frontend): + params = {} + params['input_shapes'] = input_shapes + self._test(*self.create_keras_conv_2d_transpose_net(input_shapes, input_type, + filters, kernel_size, + strides, padding, data_format, + dilation_rate, activation, + use_bias), ie_device, precision, temp_dir=temp_dir, ir_version=ir_version, use_legacy_frontend=use_legacy_frontend, **params) From 5b43849156a8028c18b26abdac600bbb58386739 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Mon, 18 Nov 2024 10:19:38 +0100 Subject: [PATCH 06/53] [DOCS] aipc model table update mstr (#27586) --- .../generative-ai-performance.rst | 34 +++- .../benchmarks_files/llm_models_7-155H.csv | 156 +++++++++++++++ .../benchmarks_files/llm_models_7-258V.csv | 182 ++++++++++++++++++ .../{llm_models.csv => llm_models_9-288V.csv} | 0 4 files changed, 363 insertions(+), 9 deletions(-) create mode 100644 docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv create mode 100644 docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv rename 
docs/sphinx_setup/_static/benchmarks_files/{llm_models.csv => llm_models_9-288V.csv} (100%) diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index b8256af650e2f8..5697fcbf6e4d74 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -5,9 +5,7 @@ This page is regularly updated to help you identify the best-performing LLMs on Intel® Core™ Ultra processor family and AI PCs. The current data is as of OpenVINO 2024.4, 24 Oct. 2024 -The tables below list the key performance indicators for a selection of Large Language Models, -running on an Intel® Core™ Ultra 7-165H, Intel® Core™ Ultra 7-265V, and Intel® Core™ Ultra -7-288V based system, on built-in GPUs. +The tables below list the key performance indicators for inference on built-in GPUs. @@ -16,14 +14,32 @@ running on an Intel® Core™ Ultra 7-165H, Intel® Core™ Ultra 7-265V, and In +.. tab-set:: -.. csv-table:: - :class: modeldata stripe - :name: supportedModelsTableOv - :header-rows: 1 - :file: ../../_static/benchmarks_files/llm_models.csv + .. tab-item:: 9-288V + + .. csv-table:: + :class: modeldata stripe + :name: supportedModelsTableOv + :header-rows: 1 + :file: ../../_static/benchmarks_files/llm_models_9-288V.csv + + .. tab-item:: 7-268V + + .. csv-table:: + :class: modeldata stripe + :name: supportedModelsTableOv + :header-rows: 1 + :file: ../../_static/benchmarks_files/llm_models_7-258V.csv + + .. tab-item:: 7-155H + + .. csv-table:: + :class: modeldata stripe + :name: supportedModelsTableOv + :header-rows: 1 + :file: ../../_static/benchmarks_files/llm_models_7-155H.csv -| .. 
grid:: 1 1 2 2 :gutter: 4 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv new file mode 100644 index 00000000000000..d2c68a3619620e --- /dev/null +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv @@ -0,0 +1,156 @@ +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec,,, +opt-125m-gptq,INT4-MIXED,32,965.9,29,7.7,129.87,,, +opt-125m-gptq,INT4-MIXED,1024,1507.9,113.1,7.8,128.21,,, +tiny-llama-1.1b-chat,INT4-MIXED,32,1831.8,46.5,16.7,59.88,,, +tiny-llama-1.1b-chat,INT4-MIXED,1024,1806.3,635,17.8,56.18,,, +qwen2-0.5b,INT4-MIXED,32,2551.7,61.4,18.3,54.64,,, +qwen2-0.5b,INT4-MIXED,1024,2976.6,356.1,19.2,52.08,,, +tiny-llama-1.1b-chat,INT8-CW,32,1987.4,56,21.6,46.30,,, +tiny-llama-1.1b-chat,INT8-CW,1024,2209.1,772.7,22.6,44.25,,, +qwen2-0.5b,INT8-CW,32,2484.9,57.3,22.8,43.86,,, +qwen2-0.5b,INT8-CW,1024,3102.5,407.1,23.9,41.84,,, +qwen2-1.5b,INT4-MIXED,32,4265.2,71.7,25.5,39.22,,, +qwen2-1.5b,INT4-MIXED,1024,4884.5,862.4,26.8,37.31,,, +dolly-v2-3b,INT4-MIXED,32,2401.3,89.6,27.5,36.36,,, +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2511.5,78.6,28.2,35.46,,, +phi-2,INT4-MIXED,32,2279.5,95.7,29.1,34.36,,, +minicpm-1b-sft,INT4-MIXED,31,2759.9,104.4,30.9,32.36,,, +phi-2,INT4-MIXED,32,2620.1,100.8,31,32.26,,, +stable-zephyr-3b-dpo,INT4-MIXED,30,2636.5,86.8,31.7,31.55,,, +dolly-v2-3b,INT4-MIXED,1024,3137.1,1782.9,32.2,31.06,,, +red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,3118.5,1831.7,33.3,30.03,,, +red-pajama-incite-chat-3b-v1,INT4-MIXED,1024,2862.7,1821.1,33.5,29.85,,, +qwen2-1.5b,INT8-CW,32,4831.2,87,33.8,29.59,,, +opt-2.7b,INT4-MIXED,31,2898.3,73.2,33.9,29.50,,, +phi-2,INT4-MIXED,1024,2797.4,1887,34,29.41,,, +orca-mini-3b,INT4-MIXED,32,2877.8,100.3,35,28.57,,, +stablelm-3b-4e1t,INT4-MIXED,32,2669.4,94.7,35.3,28.33,,, +qwen2-1.5b,INT8-CW,1024,5455.8,1047.6,35.3,28.33,,, +minicpm-1b-sft,INT8-CW,31,3104.1,103.5,35.3,28.33,,, +phi-2,INT4-MIXED,1024,3039.8,1917.4,35.9,27.86,,, +stable-zephyr-3b-dpo,INT4-MIXED,946,3411.4,1695,37,27.03,,, +gemma-2b-it,INT4-MIXED,32,3991.7,116.1,37.9,26.39,,, +opt-2.7b,INT4-MIXED,937,3617.5,1764.9,38.2,26.18,,, +phi-3-mini-4k-instruct,INT4-MIXED,31,2935.3,111.6,38.2,26.18,,, +phi-3-mini-4k-instruct,INT4-MIXED,38,3102.4,134,38.4,26.04,,, +phi-3-mini-4k-instruct,INT4-MIXED,31,2986.1,114.1,38.9,25.71,,, +phi-3-mini-4k-instruct,INT4-MIXED,38,2977.4,131.1,39,25.64,,, +gemma-2b-it,INT4-MIXED,1024,4973.3,1249.2,39.7,25.19,,, +stablelm-3b-4e1t,INT4-MIXED,1024,3196.9,2045.4,39.9,25.06,,, +dolly-v2-3b,INT8-CW,32,3490.2,107.4,41.5,24.10,,, +red-pajama-incite-chat-3b-v1,INT8-CW,32,3457.9,105,42.5,23.53,,, +opt-2.7b,INT8-CW,31,3686.8,107.5,44.1,22.68,,, +phi-2,INT8-CW,32,3554.9,116.6,44.1,22.68,,, +phi-3-mini-4k-instruct,INT4-MIXED,1023,3390.7,2277.1,44.2,22.62,,, +phi-3-mini-4k-instruct,INT4-MIXED,1061,3643.6,2485,44.4,22.52,,, +phi-3-mini-4k-instruct,INT4-MIXED,1023,3516.4,2280.9,44.5,22.47,,, +phi-3-mini-4k-instruct,INT4-MIXED,1061,3537.2,2522.4,44.7,22.37,,, +orca-mini-3b,INT4-MIXED,1024,3557.3,1898.9,45,22.22,,, +minicpm-1b-sft,FP16,31,3814.4,97.9,45.4,22.03,,, +stablelm-3b-4e1t,INT8-CW,32,3486.9,100.5,46.1,21.69,,, +stable-zephyr-3b-dpo,INT8-CW,30,3516.7,101.9,46.1,21.69,,, +dolly-v2-3b,INT8-CW,1024,4265.9,2178.6,46.2,21.65,,, +red-pajama-incite-chat-3b-v1,INT8-CW,1020,3979.1,2219.7,47.2,21.19,,, +red-pajama-incite-chat-3b-v1,INT8-CW,1024,3975.5,2199.7,47.3,21.14,,, +opt-2.7b,INT8-CW,937,4358.6,1981.8,48.4,20.66,,, 
+phi-2,INT8-CW,1024,4058.1,2280.1,48.9,20.45,,, +gemma-2b-it,INT8-CW,32,4786.8,119.8,49.4,20.24,,, +chatglm3-6b,INT4-MIXED,32,4141.5,166.6,49.7,20.12,,, +stablelm-3b-4e1t,INT8-CW,1024,4054.8,2243.5,50.7,19.72,,, +stable-zephyr-3b-dpo,INT8-CW,946,4521.8,1816.4,51.3,19.49,,, +gemma-2b-it,INT8-CW,1024,5810.7,1580,51.3,19.49,,, +chatglm3-6b,INT4-MIXED,32,4651.4,164.7,51.6,19.38,,, +chatglm3-6b,INT4-MIXED,1024,4235.1,2818.7,52.3,19.12,,, +orca-mini-3b,INT8-CW,32,4162,109.2,53.3,18.76,,, +chatglm3-6b,INT4-MIXED,1024,4783.8,2869,54.4,18.38,,, +gpt-j-6b,INT4-MIXED,32,4667.3,176.7,56.3,17.76,,, +chatglm3-6b-gptq,INT4-MIXED,32,5369.4,173.9,58.9,16.98,,, +llama-2-7b-chat-hf,INT4-MIXED,32,4280,173.2,60.1,16.64,,, +phi-3-mini-4k-instruct,INT8-CW,31,4585.1,123,60.5,16.53,,, +phi-3-mini-4k-instruct,INT8-CW,38,4597,152,60.5,16.53,,, +chatglm2-6b,INT4-MIXED,32,4847.8,158.7,60.6,16.50,,, +vicuna-7b-v1.5,INT4-MIXED,32,4476.9,178.2,61.2,16.34,,, +chatglm3-6b-gptq,INT4-MIXED,1024,5217.6,2863.7,61.3,16.31,,, +mistral-7b-v0.1,INT4-MIXED,31,4413.6,194,61.7,16.21,,, +qwen2-7b,INT4-MIXED,32,7044.7,184.4,61.7,16.21,,, +mistral-7b-v0.1,INT4-MIXED,32,4427.6,193.3,61.8,16.18,,, +orca-mini-3b,INT8-CW,1024,4821.6,2239.1,62,16.13,,, +codegen25-7b,INT4-MIXED,32,4687.2,176.2,62.7,15.95,,, +chatglm2-6b,INT4-MIXED,1024,5165.9,3148,63,15.87,,, +llama-2-7b-gptq,INT4-MIXED,32,4632.8,175.2,63.4,15.77,,, +stablelm-7b,INT4-MIXED,32,5219.5,206.3,63.4,15.77,,, +qwen-7b-chat,INT4-MIXED,32,7805.6,193.8,63.6,15.72,,, +gpt-j-6b,INT4-MIXED,1024,5314.9,3111.8,63.6,15.72,,, +qwen2-7b,INT4-MIXED,1024,7716.2,3548.3,64.1,15.60,,, +llama-3-8b,INT4-MIXED,32,4910.9,204.8,64.7,15.46,,, +mistral-7b-v0.1,INT4-MIXED,1024,4720.8,3667.1,64.8,15.43,,, +mistral-7b-v0.1,INT4-MIXED,1007,4704.7,3685.4,64.9,15.41,,, +llama-3.1-8b,INT4-MIXED,31,4850.3,211.5,64.9,15.41,,, +phi-3-mini-4k-instruct,INT8-CW,1023,5128.6,2815.2,65.7,15.22,,, +phi-3-mini-4k-instruct,INT8-CW,1061,5155,3407.9,65.9,15.17,,, +mistral-7b-v0.1,INT4-MIXED,32,4939.3,192,66.5,15.04,,, +llama-3-8b,INT4-MIXED,33,4919.4,261.9,67.2,14.88,,, +llama-2-7b-chat-hf,INT4-MIXED,1024,4948.2,3811,67.3,14.86,,, +qwen1.5-7b-chat,INT4-MIXED,32,5943.1,180.5,67.7,14.77,,, +qwen-7b-chat-gptq,INT4-MIXED,32,8057,187,68.1,14.68,,, +llama-3-8b,INT4-MIXED,32,5503.5,198.4,68.1,14.68,,, +qwen-7b-chat,INT4-MIXED,32,8091.6,185.9,68.1,14.68,,, +llama-3-8b,INT4-MIXED,1024,5569.1,3920.5,68.2,14.66,,, +llama-3.1-8b,INT4-MIXED,31,5358.6,201,68.2,14.66,,, +stablelm-7b,INT4-MIXED,1020,5804.4,3726.6,68.8,14.53,,, +llama-3.1-8b,INT4-MIXED,31,5452.6,202.9,68.8,14.53,,, +llama-2-7b-chat-hf,INT4-MIXED,32,5023,165.7,69,14.49,,, +llama-3-8b,INT4-MIXED,32,5413.6,202,69.1,14.47,,, +llama-3-8b,INT4-MIXED,33,5440.4,262.1,69.2,14.45,,, +codegen25-7b,INT4-MIXED,1024,5434.6,3513.2,69.9,14.31,,, +mistral-7b-v0.1,INT4-MIXED,1024,5614.9,3819.1,70,14.29,,, +mistral-7b-v0.1,INT4-MIXED,31,4927.8,205,70.5,14.18,,, +llama-3-8b,INT4-MIXED,33,5498.9,270.7,70.6,14.16,,, +llama-3-8b,INT4-MIXED,1025,5577.4,4271.2,70.6,14.16,,, +llama-2-7b-gptq,INT4-MIXED,1024,5302.2,3529.4,70.7,14.14,,, +zephyr-7b-beta,INT4-MIXED,32,5212.4,190.6,71.2,14.04,,, +llama-3-8b,INT4-MIXED,1024,6161.1,3918,71.5,13.99,,, +llama-3-8b,INT4-MIXED,1025,6098,4441.8,72.3,13.83,,, +llama-3-8b,INT4-MIXED,1024,6071.7,3972.2,72.4,13.81,,, +mistral-7b-v0.1,INT4-MIXED,1007,5224.1,4153.4,73.8,13.55,,, +llama-3-8b,INT4-MIXED,1025,6156.9,4357,73.9,13.53,,, +zephyr-7b-beta,INT4-MIXED,1024,5511.6,3978,74.4,13.44,,, +opt-2.7b,FP16,31,9220.3,107.8,74.7,13.39,,, 
+dolly-v2-3b,FP16,32,6058.9,109.9,74.7,13.39,,, +qwen1.5-7b-chat,INT4-MIXED,1024,7063.2,3791.7,75,13.33,,, +qwen-7b-chat,INT4-MIXED,1024,8919.5,3763.9,75,13.33,,, +red-pajama-incite-chat-3b-v1,FP16,32,6036.5,107.5,75.9,13.18,,, +llama-2-7b-chat-hf,INT4-MIXED,1024,5716.8,4231.7,76.2,13.12,,, +phi-2,FP16,32,6090.1,115.2,77.1,12.97,,, +stable-zephyr-3b-dpo,FP16,30,6113.1,112.1,78.6,12.72,,, +qwen-7b-chat,INT4-MIXED,1024,9212.9,3857.4,78.6,12.72,,, +stablelm-3b-4e1t,FP16,32,6065.4,110.2,78.7,12.71,,, +opt-2.7b,FP16,937,9733.8,3750.8,78.8,12.69,,, +dolly-v2-3b,FP16,1024,6615.2,2230.9,79.1,12.64,,, +red-pajama-incite-chat-3b-v1,FP16,1020,6588.3,2259.4,80.2,12.47,,, +glm-4-9b,INT4-MIXED,33,6386.2,328,80.4,12.44,,, +red-pajama-incite-chat-3b-v1,FP16,1024,6570.3,2268.7,80.4,12.44,,, +baichuan2-7b-chat,INT4-MIXED,32,5977.9,201.7,81,12.35,,, +glm-4-9b,INT4-MIXED,32,6389.7,248.1,81,12.35,,, +phi-2,FP16,1024,6646.2,2406.7,81.4,12.29,,, +stable-zephyr-3b-dpo,FP16,946,6875.7,1868.2,82.9,12.06,,, +stablelm-3b-4e1t,FP16,1024,6636.1,2036.9,83,12.05,,, +chatglm2-6b,INT8-CW,32,6731.8,159.2,84.4,11.85,,, +glm-4-9b,INT4-MIXED,1025,7061.4,4939.2,85.2,11.74,,, +qwen-7b-chat-gptq,INT4-MIXED,1024,9175.3,3898,85.3,11.72,,, +gemma-7b-it,INT4-MIXED,32,7883.9,230.5,86,11.63,,, +gemma-7b-it,INT4-MIXED,32,8002.6,235,86.1,11.61,,, +glm-4-9b,INT4-MIXED,1024,7064.9,4411.2,86.2,11.60,,, +gpt-j-6b,INT8-CW,32,7009.2,176.8,86.4,11.57,,, +chatglm2-6b,INT8-CW,1024,7050.5,3871.6,86.8,11.52,,, +chatglm3-6b,INT8-CW,32,6755.9,159,86.8,11.52,,, +baichuan2-7b-chat,INT4-MIXED,1024,7033.3,4049,88.8,11.26,,, +chatglm3-6b,INT8-CW,1024,7076.5,3865.9,89.2,11.21,,, +qwen-7b-chat,INT4-MIXED,32,9245.7,176.3,90,11.11,,, +gemma-7b-it,INT4-MIXED,1024,9449.4,4305.8,93.2,10.73,,, +gpt-j-6b,INT8-CW,1024,7672.3,4181.1,93.5,10.70,,, +gemma-7b-it,INT4-MIXED,1024,9330.5,4222.5,93.7,10.67,,, +orca-mini-3b,FP16,32,7416.5,122.3,94.7,10.56,,, +codegen25-7b,INT8-CW,32,7557.6,170.7,98.4,10.16,,, +qwen-7b-chat,INT4-MIXED,1024,10371.1,4271.7,98.9,10.11,,, +llama-2-7b-chat-hf,INT8-CW,32,7390.6,171.6,99.9,10.01,,, diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv new file mode 100644 index 00000000000000..efbf0cee8e4a80 --- /dev/null +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv @@ -0,0 +1,182 @@ +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec,,, +opt-125m-gptq,INT4-MIXED,1024,1513.6,81.9,7.8,128.21,,, +opt-125m-gptq,INT4-MIXED,32,979.9,50.4,7.9,126.58,,, +tiny-llama-1.1b-chat,INT4-MIXED,1024,1943.3,176.3,16.8,59.52,,, +tiny-llama-1.1b-chat,INT4-MIXED,32,1982.2,59.5,17.1,58.48,,, +qwen2-0.5b,INT4-MIXED,32,2678,117.3,18.7,53.48,,, +tiny-llama-1.1b-chat,INT8-CW,32,2080.9,59.4,19,52.63,,, +qwen2-0.5b,INT4-MIXED,1024,3036.1,165.5,19.2,52.08,,, +tiny-llama-1.1b-chat,INT8-CW,1024,2287,241.4,19.6,51.02,,, +qwen2-0.5b,INT8-CW,1024,3084.9,172.1,20,50.00,,, +qwen2-0.5b,INT8-CW,32,2518,105.5,21.4,46.73,,, +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2793.6,141.8,23.9,41.84,,, +qwen2-1.5b,INT4-MIXED,32,4515.4,118.7,24,41.67,,, +qwen2-1.5b,INT4-MIXED,1024,4930.1,229.6,24.3,41.15,,, +dolly-v2-3b,INT4-MIXED,32,2486.1,174,25.4,39.37,,, +phi-2,INT4-MIXED,32,2552.9,210.6,26.9,37.17,,, +red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,2934.1,464.5,27.5,36.36,,, +qwen2-1.5b,INT8-CW,32,4813.4,119.1,27.8,35.97,,, +opt-2.7b,INT4-MIXED,31,3172.5,131.9,28.5,35.09,,, 
+red-pajama-incite-chat-3b-v1,INT4-MIXED,1024,3038.2,447.1,28.6,34.97,,, +dolly-v2-3b,INT4-MIXED,1024,2947.4,409,28.8,34.72,,, +qwen2-1.5b,INT8-CW,1024,5394.8,327.9,29.3,34.13,,, +stable-zephyr-3b-dpo,INT4-MIXED,30,2728.1,131.2,29.8,33.56,,, +phi-2,INT4-MIXED,32,2805.1,208.3,30.2,33.11,,, +minicpm-1b-sft,INT8-CW,31,3104.2,147.8,30.9,32.36,,, +phi-2,INT4-MIXED,1024,3058.9,602.9,31.1,32.15,,, +minicpm-1b-sft,INT4-MIXED,31,2970.1,183.7,31.1,32.15,,, +stablelm-3b-4e1t,INT4-MIXED,32,3077.1,183.2,31.6,31.65,,, +opt-2.7b,INT4-MIXED,937,3416.7,429.4,31.6,31.65,,, +stable-zephyr-3b-dpo,INT4-MIXED,946,3211.8,428.8,32.3,30.96,,, +phi-3-mini-4k-instruct,INT4-MIXED,31,3014.5,116,32.5,30.77,,, +phi-3-mini-4k-instruct,INT4-MIXED,38,2957.4,153.9,32.5,30.77,,, +phi-2,INT4-MIXED,1024,3278.9,613.3,33.4,29.94,,, +phi-3-mini-4k-instruct,INT4-MIXED,38,3288.5,152.9,33.4,29.94,,, +phi-3-mini-4k-instruct,INT4-MIXED,31,3265.1,123.6,34.1,29.33,,, +gemma-2b-it,INT4-MIXED,32,4162.1,208.8,34.2,29.24,,, +stablelm-3b-4e1t,INT4-MIXED,1024,3525.8,524.5,35,28.57,,, +phi-3-mini-4k-instruct,INT4-MIXED,1061,3427.8,777.5,36.5,27.40,,, +phi-3-mini-4k-instruct,INT4-MIXED,1023,3405.4,554.1,36.7,27.25,,, +gemma-2b-it,INT4-MIXED,1024,5053.1,354.8,36.9,27.10,,, +minicpm-1b-sft,FP16,31,3595.5,124.9,36.9,27.10,,, +phi-3-mini-4k-instruct,INT4-MIXED,1061,3547.2,755.8,37.1,26.95,,, +phi-3-mini-4k-instruct,INT4-MIXED,1023,3528.4,536.4,37.4,26.74,,, +red-pajama-incite-chat-3b-v1,INT8-CW,32,3747.7,189.9,38.1,26.25,,, +opt-2.7b,INT8-CW,31,3810.7,145.7,38.5,25.97,,, +chatglm3-6b,INT4-MIXED,32,4120.7,67.3,38.7,25.84,,, +dolly-v2-3b,INT8-CW,32,3747,188.4,39.2,25.51,,, +chatglm3-6b,INT4-MIXED,32,4482.9,69.9,40.7,24.57,,, +chatglm3-6b,INT4-MIXED,1024,4146,606.8,41,24.39,,, +opt-2.7b,INT8-CW,937,4458.9,587.8,41.8,23.92,,, +red-pajama-incite-chat-3b-v1,INT8-CW,1024,4088.4,634.1,41.9,23.87,,, +red-pajama-incite-chat-3b-v1,INT8-CW,1020,4086.8,653.4,42,23.81,,, +phi-2,INT8-CW,32,3794.6,202.7,42.1,23.75,,, +chatglm3-6b,INT4-MIXED,1024,4446.7,598.6,42.3,23.64,,, +stablelm-3b-4e1t,INT8-CW,32,3652.5,146,42.6,23.47,,, +stable-zephyr-3b-dpo,INT8-CW,30,3768.6,151.9,42.6,23.47,,, +dolly-v2-3b,INT8-CW,1024,4092,603.1,42.9,23.31,,, +stablelm-3b-4e1t,INT8-CW,1024,4143.2,671.7,45.2,22.12,,, +gemma-2b-it,INT8-CW,32,4878.4,221.6,45.6,21.93,,, +phi-2,INT8-CW,1024,4153.6,810.3,46,21.74,,, +llama-2-7b-chat-hf,INT4-MIXED,32,4394.6,109.7,46.2,21.65,,, +chatglm3-6b-gptq,INT4-MIXED,32,5218.9,79.7,46.7,21.41,,, +stable-zephyr-3b-dpo,INT8-CW,946,4360.1,627.8,46.8,21.37,,, +vicuna-7b-v1.5,INT4-MIXED,32,4482.3,101.2,47.2,21.19,,, +gemma-2b-it,INT8-CW,1024,5837.1,507.1,48,20.83,,, +llama-2-7b-gptq,INT4-MIXED,32,4734.3,102.8,48.1,20.79,,, +orca-mini-3b,INT4-MIXED,32,2720.1,132,48.1,20.79,,, +qwen-7b-chat,INT4-MIXED,32,7803.7,178.5,48.3,20.70,,, +mistral-7b-v0.1,INT4-MIXED,31,4537.5,99,48.5,20.62,,, +codegen25-7b,INT4-MIXED,32,4723.3,108.5,48.5,20.62,,, +chatglm3-6b-gptq,INT4-MIXED,1024,5150.8,614.2,48.8,20.49,,, +mistral-7b-v0.1,INT4-MIXED,32,4572,102.9,48.8,20.49,,, +llama-3-8b,INT4-MIXED,33,4991.2,252.2,50.9,19.65,,, +qwen-7b-chat-gptq,INT4-MIXED,32,8088.4,212.6,51,19.61,,, +chatglm2-6b,INT4-MIXED,32,4960.6,105.5,51.2,19.53,,, +gpt-j-6b,INT4-MIXED,32,4699.5,259.2,51.4,19.46,,, +llama-3.1-8b,INT4-MIXED,31,4897.8,106.9,51.5,19.42,,, +llama-3-8b,INT4-MIXED,32,4999.7,105.9,51.6,19.38,,, +qwen-7b-chat,INT4-MIXED,32,8085.9,193.5,51.7,19.34,,, +falcon-7b-instruct,INT4-MIXED,32,5416.2,175,52.5,19.05,,, +mistral-7b-v0.1,INT4-MIXED,1007,4772.6,803,52.6,19.01,,, 
+qwen1.5-7b-chat,INT4-MIXED,32,6027.3,174.9,53,18.87,,, +mistral-7b-v0.1,INT4-MIXED,1024,4775,717.6,53,18.87,,, +llama-2-7b-chat-hf,INT4-MIXED,1024,4976.5,992.1,53.1,18.83,,, +qwen2-7b,INT4-MIXED,32,7087.1,138.1,53.3,18.76,,, +llama-2-7b-gptq,INT4-MIXED,1024,5351.2,711.6,53.7,18.62,,, +llama-3-8b,INT4-MIXED,32,5472.8,109.4,53.7,18.62,,, +phi-3-mini-4k-instruct,INT8-CW,38,4575.3,115.9,53.7,18.62,,, +stablelm-7b,INT4-MIXED,32,5213.7,128.5,53.8,18.59,,, +phi-3-mini-4k-instruct,INT8-CW,31,4571.8,118.9,53.8,18.59,,, +llama-3-8b,INT4-MIXED,33,5480.4,246.8,53.9,18.55,,, +llama-3-8b,INT4-MIXED,32,5528.2,144.9,54.3,18.42,,, +llama-3.1-8b,INT4-MIXED,31,5377.3,112.8,54.3,18.42,,, +chatglm2-6b,INT4-MIXED,1024,5232.3,759.6,54.6,18.32,,, +llama-3.1-8b,INT4-MIXED,31,5440.4,126.4,54.8,18.25,,, +llama-3-8b,INT4-MIXED,33,5532.8,248.2,54.9,18.21,,, +codegen25-7b,INT4-MIXED,1024,5412.9,714.8,55,18.18,,, +mistral-7b-v0.1,INT4-MIXED,32,4998.5,117.3,55.2,18.12,,, +mistral-7b-v0.1,INT4-MIXED,31,5000.2,122.4,55.6,17.99,,, +llama-3-8b,INT4-MIXED,1024,5594,953.5,56.6,17.67,,, +gpt-j-6b,INT4-MIXED,1024,5323.8,1254,56.8,17.61,,, +llama-3-8b,INT4-MIXED,1025,5596.7,1192.3,56.8,17.61,,, +qwen2-7b,INT4-MIXED,1024,7722.1,714.2,57,17.54,,, +phi-3-mini-4k-instruct,INT8-CW,1023,5067.1,818.5,57.4,17.42,,, +phi-3-mini-4k-instruct,INT8-CW,1061,5086.1,975.1,57.4,17.42,,, +llama-2-7b-chat-hf,INT4-MIXED,32,5087.7,126.2,57.9,17.27,,, +stablelm-7b,INT4-MIXED,1020,5780.5,1248.4,59,16.95,,, +llama-3-8b,INT4-MIXED,1025,6088.9,1381.5,59,16.95,,, +llama-3-8b,INT4-MIXED,1024,6084.8,931.2,59.2,16.89,,, +llama-3-8b,INT4-MIXED,1025,6141.2,1494.3,59.4,16.84,,, +llama-3-8b,INT4-MIXED,1024,6133.8,1075.2,59.6,16.78,,, +mistral-7b-v0.1,INT4-MIXED,1024,5472.6,794.3,59.7,16.75,,, +zephyr-7b-beta,INT4-MIXED,32,5328.5,103.5,59.8,16.72,,, +falcon-7b-instruct,INT4-MIXED,1024,5677.5,686.2,59.8,16.72,,, +mistral-7b-v0.1,INT4-MIXED,1007,5243.5,1074,59.9,16.69,,, +qwen1.5-7b-chat,INT4-MIXED,1024,7096.7,1132.7,60,16.67,,, +qwen-7b-chat,INT4-MIXED,1024,8872.6,792.8,61,16.39,,, +qwen-7b-chat,INT4-MIXED,1024,9164.4,822.6,63.3,15.80,,, +orca-mini-3b,INT8-CW,32,4221.7,170.6,63.5,15.75,,, +llama-2-7b-chat-hf,INT4-MIXED,1024,5708.1,1397.9,63.6,15.72,,, +glm-4-9b,INT4-MIXED,33,6402.9,307.1,63.8,15.67,,, +zephyr-7b-beta,INT4-MIXED,1024,5572.4,1156.4,64.3,15.55,,, +glm-4-9b,INT4-MIXED,32,6383.1,256.2,64.5,15.50,,, +baichuan2-7b-chat,INT4-MIXED,32,5926.3,191.8,65.8,15.20,,, +opt-2.7b,FP16,31,5886,112.2,68,14.71,,, +dolly-v2-3b,FP16,32,6161.5,147.5,69.5,14.39,,, +red-pajama-incite-chat-3b-v1,FP16,32,6265.4,146.2,69.6,14.37,,, +glm-4-9b,INT4-MIXED,1024,6994.5,1013.7,69.8,14.33,,, +opt-2.7b,FP16,937,6345,379.5,71.6,13.97,,, +glm-4-9b,INT4-MIXED,1025,7014.9,1416.8,72.5,13.79,,, +phi-2,FP16,32,6204.7,189.2,72.9,13.72,,, +stable-zephyr-3b-dpo,FP16,30,6221.4,159.7,73,13.70,,, +dolly-v2-3b,FP16,1024,6669.9,424.3,73.3,13.64,,, +red-pajama-incite-chat-3b-v1,FP16,1020,6658.8,484.7,73.4,13.62,,, +stablelm-3b-4e1t,FP16,32,6216.3,145.4,73.5,13.61,,, +qwen-7b-chat,INT4-MIXED,32,9294.9,144.4,73.8,13.55,,, +red-pajama-incite-chat-3b-v1,FP16,1024,6755.1,469.1,73.9,13.53,,, +qwen-7b-chat-gptq,INT4-MIXED,1024,9152.1,827.2,75.1,13.32,,, +gemma-7b-it,INT4-MIXED,32,7991.4,128.6,75.8,13.19,,, +chatglm2-6b,INT8-CW,32,6854.4,110.2,76.3,13.11,,, +chatglm3-6b,INT8-CW,32,6754.8,112.3,76.4,13.09,,, +stable-zephyr-3b-dpo,FP16,946,6940,428.6,76.7,13.04,,, +baichuan2-7b-chat,INT4-MIXED,1024,6930.2,1229.5,76.7,13.04,,, +gemma-7b-it,INT4-MIXED,32,8061.5,125.6,76.7,13.04,,, 
+stablelm-3b-4e1t,FP16,1024,6722.9,480.8,77,12.99,,,
+phi-2,FP16,1024,6709.4,624.1,77.2,12.95,,,
+chatglm2-6b,INT8-CW,1024,7132.9,1361.9,78.7,12.71,,,
+chatglm3-6b,INT8-CW,1024,7037.5,1389.2,78.7,12.71,,,
+qwen-7b-chat,INT4-MIXED,1024,10374.1,1357.5,81.1,12.33,,,
+gemma-7b-it,INT4-MIXED,1024,9398,1268.5,82.7,12.09,,,
+gemma-7b-it,INT4-MIXED,1024,9469.5,1268,83.2,12.02,,,
+gpt-j-6b,INT8-CW,32,7126.5,255.2,87.2,11.47,,,
+falcon-7b-instruct,INT8-CW,32,8287.6,131.1,88.4,11.31,,,
+llama-2-7b-chat-hf,INT8-CW,32,7474.9,139.5,89.7,11.15,,,
+codegen25-7b,INT8-CW,32,7559.4,138,90.8,11.01,,,
+vicuna-7b-v1.5,INT8-CW,32,7390.8,136.6,90.8,11.01,,,
+falcon-7b-instruct,INT8-CW,1024,8546.8,1205.9,92.2,10.85,,,
+stablelm-7b,INT8-CW,32,8356.4,143,92.4,10.82,,,
+qwen2-7b,INT8-CW,32,9940.7,132,92.5,10.81,,,
+baichuan2-13b-chat,INT4-MIXED,32,9879.2,184.9,93.3,10.72,,,
+phi-3-mini-4k-instruct,FP16,38,8290,125.2,93.4,10.71,,,
+phi-3-mini-4k-instruct,FP16,31,8290.5,109.5,93.5,10.70,,,
+gpt-j-6b,INT8-CW,1024,7759,1996.8,93.9,10.65,,,
+llama-2-7b-chat-hf,INT8-CW,1024,8097.8,1701.6,94.7,10.56,,,
+phi-3-medium-4k-instruct,INT4-MIXED,38,8210.4,527,95.1,10.52,,,
+mistral-7b-v0.1,INT8-CW,31,7882.4,128.6,95.1,10.52,,,
+vicuna-7b-v1.5,INT8-CW,1024,8013.2,1558.1,95.1,10.52,,,
+mistral-7b-v0.1,INT8-CW,32,7886.9,140.6,95.2,10.50,,,
+qwen2-7b,INT8-CW,1024,10573.1,1564.5,95.3,10.49,,,
+codegen25-7b,INT8-CW,1024,8253.1,1526.3,95.7,10.45,,,
+zephyr-7b-beta,INT8-CW,32,7785.3,144.4,95.8,10.44,,,
+stablelm-7b,INT8-CW,1020,8921.9,1845,96.9,10.32,,,
+mistral-7b-v0.1,INT8-CW,1007,8127.4,1648.4,97.4,10.27,,,
+qwen-7b-chat,INT8-CW,32,11083.2,140.6,97.7,10.24,,,
+qwen1.5-7b-chat,INT8-CW,32,8870,156.4,98.1,10.19,,,
+llama-3.1-8b,INT8-CW,31,8600.3,189.2,98.4,10.16,,,
+mistral-7b-v0.1,INT8-CW,1024,8134.7,1554.1,98.4,10.16,,,
+qwen-14b-chat,INT4-MIXED,32,9876.2,192.3,98.6,10.14,,,
+zephyr-7b-beta,INT8-CW,1024,8035.2,1580.4,98.8,10.12,,,
+llama-3-8b,INT8-CW,32,8694.2,150.7,99.5,10.05,,,
+llama-3-8b,INT8-CW,33,8700.4,175.4,99.8,10.02,,,
+phi-3-mini-4k-instruct,FP16,1023,8795.2,601.3,99.9,10.01,,,
diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv
similarity index 100%
rename from docs/sphinx_setup/_static/benchmarks_files/llm_models.csv
rename to docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv

From 9705c7e9e2f447ae14d1cad0066aa35d6df511bc Mon Sep 17 00:00:00 2001
From: Pawel Raasz
Date: Mon, 18 Nov 2024 10:21:58 +0100
Subject: [PATCH 07/53] [docs] Align LSTMSequence specification attributes
 order with constructor (#27559)

### Details:
 - Align LSTMSequence specification attributes order with constructor

### Related PRs:
 - #27355

### Tickets:
 - CVS-156182

Signed-off-by: Pawel Raasz
---
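For orientation before the diff below: the attribute order the specification is being aligned to is the trailing-parameter order of the `LSTMSequence` constructor. A compilable mock of that tail is shown here; the names and types are paraphrased from memory rather than taken from the OpenVINO headers, so treat it as an illustration of the ordering only:

```cpp
#include <cstdint>
#include <string>
#include <vector>

// Mock mirroring (approximately) the tail of the ov::op::v5::LSTMSequence
// constructor; the leading tensor inputs (X, H_t, C_t, sequence_lengths,
// W, R, B) are elided. The takeaway is the attribute order the spec now
// follows: direction, activations_alpha/activations_beta, activations, clip.
enum class RecurrentSequenceDirection { FORWARD, REVERSE, BIDIRECTIONAL };

struct lstm_sequence_attrs_demo {
    lstm_sequence_attrs_demo(std::int64_t hidden_size,
                             RecurrentSequenceDirection direction,
                             std::vector<float> activations_alpha = {},
                             std::vector<float> activations_beta = {},
                             std::vector<std::string> activations = {"sigmoid", "tanh", "tanh"},
                             float clip = 0.f) {}
};
```

The diff below reorders the specification's attribute sections into exactly this sequence: *direction* first, then *activations_alpha, activations_beta*, then *activations*, then *clip*.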
 .../sequence/lstm-sequence-5.rst              | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst
index 164033bdd2831c..abad632e5ae86c 100644
--- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst
+++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst
@@ -26,13 +26,12 @@ A single cell in the sequence is implemented in the same way as in :doc:`LSTM Ce
   * **Type**: ``int``
   * **Required**: *yes*
 
-* *activations*
+* *direction*
 
-  * **Description**: *activations* specifies activation functions for gates, there are three gates, so three activation functions should be specified as a value for this attributes
-  * **Range of values**: any combination of *relu*, *sigmoid*, *tanh*
-  * **Type**: a list of strings
-  * **Default value**: *sigmoid,tanh,tanh*
-  * **Required**: *no*
+  * **Description**: Specify if the RNN is forward, reverse, or bidirectional. If it is one of *forward* or *reverse* then ``num_directions = 1``, if it is *bidirectional*, then ``num_directions = 2``. This ``num_directions`` value specifies input/output shape requirements.
+  * **Range of values**: *forward*, *reverse*, *bidirectional*
+  * **Type**: ``string``
+  * **Required**: *yes*
 
 * *activations_alpha, activations_beta*
 
@@ -42,6 +41,14 @@ A single cell in the sequence is implemented in the same way as in :doc:`LSTM Ce
   * **Default value**: None
   * **Required**: *no*
 
+* *activations*
+
+  * **Description**: *activations* specifies activation functions for gates, there are three gates, so three activation functions should be specified as a value for this attributes
+  * **Range of values**: any combination of *relu*, *sigmoid*, *tanh*
+  * **Type**: a list of strings
+  * **Default value**: *sigmoid,tanh,tanh*
+  * **Required**: *no*
+
 * *clip*
 
   * **Description**: *clip* specifies bound values *[-C, C]* for tensor clipping. Clipping is performed before activations.
@@ -50,12 +57,6 @@ A single cell in the sequence is implemented in the same way as in :doc:`LSTM Ce
   * **Default value**: *infinity* that means that the clipping is not applied
   * **Required**: *no*
 
-* *direction*
-
-  * **Description**: Specify if the RNN is forward, reverse, or bidirectional. If it is one of *forward* or *reverse* then ``num_directions = 1``, if it is *bidirectional*, then ``num_directions = 2``. This ``num_directions`` value specifies input/output shape requirements.
-  * **Range of values**: *forward*, *reverse*, *bidirectional*
-  * **Type**: ``string``
-  * **Required**: *yes*
 
 **Inputs**

From a4ee2437410b27feece4d9acd55c5d6bbe92c7c0 Mon Sep 17 00:00:00 2001
From: yuanxion <96522341+yuanxion@users.noreply.github.com>
Date: Mon, 18 Nov 2024 18:40:08 +0800
Subject: [PATCH 08/53] [GPU] Fix segmentation fault in mvn update_shapes
 (#27263)

### Details:
 - *Add mutex for std::vector push_back in mvn update_shapes, to avoid
   multi-threads competition*

### Tickets:
 - *CVS-155564*

---------

Signed-off-by: yuan.xiong
---
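Before the file-by-file changes: the crash mode described above is a plain data race on a shared `std::vector`. A minimal standalone sketch of the fix, with hypothetical type and member names (this is not the plugin's actual code):

```cpp
#include <cstddef>
#include <mutex>
#include <vector>

// Several worker threads may update shapes for the same cached structure.
// Without the lock, concurrent push_back calls race on the vector's internal
// state and can corrupt it -- the reported segmentation fault.
struct shape_cache {
    std::vector<std::size_t> recorded_shapes;  // hypothetical shared state
    std::mutex cache_mutex;                    // the added guard

    void update_shapes(std::size_t shape) {
        std::lock_guard<std::mutex> lock(cache_mutex);  // serialize writers
        recorded_shapes.push_back(shape);
    }
};
```

Keeping the `std::lock_guard` scope this tight serializes only the container mutation, so the surrounding shape computation can still run in parallel.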
 .../intel_gpu/src/graph/impls/ocl/activation.cpp | 2 +-
 .../src/graph/impls/ocl/adaptive_pooling.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/arg_max_min.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/batch_to_space.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/broadcast.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/bucketize.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/concatenation.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/convert_color.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/convolution.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp | 2 +-
 .../src/graph/impls/ocl/ctc_greedy_decoder.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/ctc_loss.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/cum_sum.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/deconvolution.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/depth_to_space.cpp | 2 +-
 .../src/graph/impls/ocl/detection_output.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/dft.cpp | 2 +-
 .../src/graph/impls/ocl/dynamic_quantize.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/eltwise.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/embedding_bag.cpp | 2 +-
 .../ocl/experimental_detectron_detection_output.cpp | 2 +-
 ...tal_detectron_generate_proposals_single_image.cpp | 2 +-
 .../experimental_detectron_prior_grid_generator.cpp | 2 +-
 .../experimental_detectron_roi_feature_extractor.cpp | 2 +-
 .../impls/ocl/experimental_detectron_topk_rois.cpp | 2 +-
 .../src/graph/impls/ocl/extract_image_patches.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/eye.cpp | 2 +-
 .../src/graph/impls/ocl/fully_connected.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp | 2 +-
 .../src/graph/impls/ocl/gather_elements.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/gather_nd.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/gather_tree.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp | 2 +-
 .../src/graph/impls/ocl/generate_proposals.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/grid_sample.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/grn.cpp | 2 +-
 .../src/graph/impls/ocl/group_normalization.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/kv_cache.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/lrn.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/lstm_elt.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/matrix_nms.cpp | 2 +-
 .../src/graph/impls/ocl/multi_stage_primitive.hpp | 12 ++++++++++++
 .../intel_gpu/src/graph/impls/ocl/multiclass_nms.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/multinomial.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp | 2 +-
 .../src/graph/impls/ocl/non_max_suppression.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/non_zero.cpp | 4 ++--
 .../intel_gpu/src/graph/impls/ocl/normalize.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/one_hot.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/permute.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/pooling.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/primitive_base.hpp | 10 ++++++++++
 .../intel_gpu/src/graph/impls/ocl/prior_box.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/quantize.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/random_uniform.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/region_yolo.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/reorder.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/reorg_yolo.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/resample.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/reshape.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/reverse.cpp | 2 +-
 .../src/graph/impls/ocl/reverse_sequence.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/roi_align.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/roi_pooling.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/roll.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp | 2 +-
 .../graph/impls/ocl/scaled_dot_product_attention.cpp | 2 +-
 .../src/graph/impls/ocl/scatter_elements_update.cpp | 2 +-
 .../src/graph/impls/ocl/scatter_nd_update.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/scatter_update.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp | 2 +-
 .../src/graph/impls/ocl/shuffle_channels.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/slice.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/softmax.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/space_to_batch.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/space_to_depth.cpp | 2 +-
 .../intel_gpu/src/graph/impls/ocl/strided_slice.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/swiglu.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/tile.cpp | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/unique.cpp | 4 ++--
 84 files changed, 106 insertions(+), 84 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
index 50da32e5dc59d3..28949bd7bab24d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
@@ -28,7 +28,7 @@ struct activation_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::activation_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/adaptive_pooling.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/adaptive_pooling.cpp
index 9f8f3ecfa5b7e2..58c943d9747348 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/adaptive_pooling.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/adaptive_pooling.cpp
@@ -19,7 +19,7 @@ struct adaptive_pooling_impl : public typed_primitive_impl_ocl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::adaptive_pooling_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp
index 42c5208705e677..dd1e8d256860d7 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp
@@ -42,7 +42,7 @@ struct arg_max_min_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::arg_max_min_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
    }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/batch_to_space.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/batch_to_space.cpp
index 87e73704b7e7cd..0863114f5c5456 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/batch_to_space.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/batch_to_space.cpp
@@ -18,7 +18,7 @@ struct batch_to_space_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::batch_to_space_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
index 9bd345427c5619..b6016646023d36 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
@@ -21,7 +21,7 @@ struct border_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::border_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/broadcast.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/broadcast.cpp
index b98a573152293c..177a449692732f 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/broadcast.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/broadcast.cpp
@@ -20,7 +20,7 @@ struct broadcast_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::broadcast_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/bucketize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/bucketize.cpp
index 2dd979cc6a16be..7fb2ae2741c149 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/bucketize.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/bucketize.cpp
@@ -20,7 +20,7 @@ struct bucketize_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::bucketize_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp
index 4cef58293cce3c..9fad28261698fb 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp
@@ -50,7 +50,7 @@ struct concatenation_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::concatenation_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/convert_color.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/convert_color.cpp
index 670e733c0c4eba..8b44fd4c586e21 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/convert_color.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/convert_color.cpp
@@ -19,7 +19,7 @@ struct convert_color_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::convert_color_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
index cda7d8f1a4cedc..d3fc6a3b486ee8 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
@@ -24,7 +24,7 @@ struct convolution_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::convolution_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp
index 612f2f1752995f..708ed26535edd1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp
@@ -20,7 +20,7 @@ struct crop_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::crop_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp
index 377c863b534f1e..0f62341000d8e1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp
@@ -32,7 +32,7 @@ struct ctc_greedy_decoder_impl : typed_primitive_impl_ocl {
 public:
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_loss.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_loss.cpp
index ae176c571ce331..e14b8bfb08e463 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_loss.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_loss.cpp
@@ -20,7 +20,7 @@ struct ctc_loss_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::ctc_loss_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp
index a7a1d124f29ac1..9a4c417d4f8678 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp
@@ -53,7 +53,7 @@ struct cum_sum_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::cum_sum_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp
index 95bd66867c1b8f..868d2e7845cc2e 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp
@@ -21,7 +21,7 @@ struct deconvolution_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::deconvolution_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp
index b4e96641fc67e7..0635141667c273 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp
@@ -19,7 +19,7 @@ struct depth_to_space_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::depth_to_space_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/detection_output.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/detection_output.cpp
index d64076653d703a..67916155a9a99b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/detection_output.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/detection_output.cpp
@@ -21,7 +21,7 @@ struct detection_output_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::detection_output_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 public:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dft.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dft.cpp
index 59e1f28e5afd2c..07b0690a7e7d67 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/dft.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dft.cpp
@@ -19,7 +19,7 @@ struct dft_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::dft_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp
index 144f58a4c0c3ea..b9fe00ac525720 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp
@@ -21,7 +21,7 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::dynamic_quantize_impl);
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/eltwise.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/eltwise.cpp
index ad836064f455d4..f8960274ab730f 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/eltwise.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/eltwise.cpp
@@ -20,7 +20,7 @@ struct eltwise_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::eltwise_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp
index 43ed52e50b81b3..6cf84edda58c7b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp
@@ -19,7 +19,7 @@ struct embedding_bag_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::embedding_bag_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp
index df65ba8beea5c6..126bc5b42e2406 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp
@@ -20,7 +20,7 @@ struct experimental_detectron_detection_output_impl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::experimental_detectron_detection_output_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_generate_proposals_single_image.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_generate_proposals_single_image.cpp
index 6242b987369126..92f62ecfe5fd1a 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_generate_proposals_single_image.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_generate_proposals_single_image.cpp
@@ -20,7 +20,7 @@ struct experimental_detectron_generate_proposals_single_image_impl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::experimental_detectron_generate_proposals_single_image_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_prior_grid_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_prior_grid_generator.cpp
index ab37af9196a165..74f6d4d12ea578 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_prior_grid_generator.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_prior_grid_generator.cpp
@@ -21,7 +21,7 @@ struct experimental_detectron_prior_grid_generator_impl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::experimental_detectron_prior_grid_generator_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_roi_feature_extractor.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_roi_feature_extractor.cpp
index ae008547fe30b6..d3bbac2a7d1207 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_roi_feature_extractor.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_roi_feature_extractor.cpp
@@ -19,7 +19,7 @@ struct experimental_detectron_roi_feature_extractor_impl : public typed_primitiv
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::experimental_detectron_roi_feature_extractor_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     event::ptr execute_impl(const std::vector& events,
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_topk_rois.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_topk_rois.cpp
index 177b9890309ebf..55cc1da32a3873 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_topk_rois.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_topk_rois.cpp
@@ -20,7 +20,7 @@ struct experimental_detectron_topk_rois_impl : typed_primitive_impl_ocl
 
     clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/extract_image_patches.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/extract_image_patches.cpp
index b8bcdf4082ee9a..0639f6b67a112e 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/extract_image_patches.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/extract_image_patches.cpp
@@ -30,7 +30,7 @@ struct extract_image_patches_impl : typed_primitive_impl_ocl
 
     clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/eye.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/eye.cpp
index 1ecd2176a29034..8b3c8b82bc805d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/eye.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/eye.cpp
@@ -20,7 +20,7 @@ struct eye_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::eye_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
index de17306a5eb2b2..04f691c2bd2ca9 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
@@ -45,7 +45,7 @@ struct fully_connected_impl : typed_primitive_impl_ocl {
     }
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
index 43f413c7e083d6..60b73a3614ead6 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
@@ -64,7 +64,7 @@ struct gather_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::gather_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp
index 4a3f9d788355ee..3a5b6d22a352e8 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp
@@ -51,7 +51,7 @@ struct gather_elements_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::gather_elements_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp
index 48416e76e43f08..01b87dff8335e7 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp
@@ -21,7 +21,7 @@ struct gather_nd_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::gather_nd_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp
index 7b510d1d7e4a16..f25805ed8ade12 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp
@@ -20,7 +20,7 @@ struct gather_tree_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::gather_tree_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp
index 174ea1fa1767a9..7277b0068d1497 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp
@@ -32,7 +32,7 @@ struct gemm_impl : multi_stage_primitive {
     const uint32_t indirect_gemm = 1;
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     gemm_impl() = default;
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/generate_proposals.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/generate_proposals.cpp
index 244c77151a812d..27da1d22d21aca 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/generate_proposals.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/generate_proposals.cpp
@@ -20,7 +20,7 @@ struct generate_proposals_impl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::generate_proposals_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/grid_sample.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/grid_sample.cpp
index 7265ee7c6387c8..16e460f07f6822 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/grid_sample.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/grid_sample.cpp
@@ -49,7 +49,7 @@ struct grid_sample_impl : public typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::grid_sample_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
    }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/grn.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/grn.cpp
index 3f3dd47e0236de..00a6eadc0da39b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/grn.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/grn.cpp
@@ -20,7 +20,7 @@ struct grn_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::grn_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 public:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp
index c112fdfa11e40e..4d1beaee07d7b3 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp
@@ -18,7 +18,7 @@ struct group_normalization_impl : typed_primitive_impl_ocl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::group_normalization_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
index d0fcace0b3f184..fef2a3c51ee821 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -68,7 +68,7 @@ struct kv_cache_impl : multi_stage_primitive {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::kv_cache_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     const size_t concat_stage = 0;
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lrn.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lrn.cpp
index 26fda8b7b65271..9c982b5daf27c2 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/lrn.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lrn.cpp
@@ -20,7 +20,7 @@ struct lrn_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lrn_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp
index b055943e057fc1..5de12d83fdbab3 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp
@@ -20,7 +20,7 @@ struct lstm_elt_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_elt_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp
index 4e6a32ace1d8ef..0159ed5a8bf5e8 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp
@@ -44,7 +44,7 @@ struct matrix_nms_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::matrix_nms_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
index 340fef53327de5..fb3839b6145dc6 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
@@ -141,6 +141,18 @@ struct multi_stage_primitive : public typed_primitive_impl {
         return {kernels_cache.get_cached_kernel_ids(_kernels)};
     }
 
+    template <typename ImplType, typename KernelParamsType>
+    static std::unique_ptr<primitive_impl> make_deep_copy(const ImplType& impl_ocl) {
+        auto prim_impl = make_unique<ImplType>(impl_ocl);
+        for (auto& _kernel_data : (*prim_impl)._kernels_data) {
+            KernelParamsType* params_ptr = dynamic_cast<KernelParamsType*>(_kernel_data.params.get());
+            if (params_ptr != nullptr) {
+                _kernel_data.params = make_unique<KernelParamsType>(*params_ptr);
+            }
+        }
+        return prim_impl;
+    }
+
     std::vector get_kernels() const override {
         return _kernels;
     }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multiclass_nms.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multiclass_nms.cpp
index c07bf9dac81daa..326e6b925dfd18 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/multiclass_nms.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multiclass_nms.cpp
@@ -40,7 +40,7 @@ struct multiclass_nms_impl : public typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::multiclass_nms_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp
index d18838f819ed75..bba802a7d98ce9 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp
@@ -18,7 +18,7 @@ struct multinomial_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::multinomial_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp
index 6c7dc79cc0aeb3..7ba564f485d4e5 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp
@@ -20,7 +20,7 @@ struct mvn_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::mvn_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp
index c80d0f9f3a7028..3e75c44cfe066b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp
@@ -21,7 +21,7 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::non_max_suppression_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp
index bdd6eb37a9d767..44be7824d4b7dd 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp
@@ -22,7 +22,7 @@ struct count_nonzero_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::count_nonzero_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
@@ -67,7 +67,7 @@ struct gather_nonzero_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::gather_nonzero_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/normalize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/normalize.cpp
index b6b705d82e4d20..d65bdb781b34b3 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/normalize.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/normalize.cpp
@@ -20,7 +20,7 @@ struct normalize_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::normalize_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/one_hot.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/one_hot.cpp
index abde905dbd2bc1..6b5fbdba4fadd9 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/one_hot.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/one_hot.cpp
@@ -20,7 +20,7 @@ struct one_hot_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::one_hot_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp
index 38585fd380a720..d3559007cfcd1c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp
@@ -47,7 +47,7 @@ struct permute_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::permute_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp
index 7d341c46e023c5..136a18f47bd94e 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp
@@ -51,7 +51,7 @@ struct pooling_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::pooling_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index ad4d47ae6531f1..9b66eee4f90a01 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -171,6 +171,16 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl {
         return {kernels_cache.get_cached_kernel_ids(_kernels)};
     }
 
+    template <typename ImplType, typename KernelParamsType>
+    static std::unique_ptr<primitive_impl> make_deep_copy(const ImplType& impl_ocl) {
+        auto prim_impl = make_unique<ImplType>(impl_ocl);
+        KernelParamsType* params_ptr = dynamic_cast<KernelParamsType*>((*prim_impl)._kernel_data.params.get());
+        if (params_ptr != nullptr) {
+            (*prim_impl)._kernel_data.params = make_unique<KernelParamsType>(*params_ptr);
+        }
+        return prim_impl;
+    }
+
     std::vector get_kernels() const override {
         return _kernels;
     }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/prior_box.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/prior_box.cpp
index b5e7c7b01c4ee8..7d10521f63f5a3 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/prior_box.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/prior_box.cpp
@@ -20,7 +20,7 @@ struct prior_box_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::prior_box_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp
index 93249c08931b3a..3e8a5ded9dd2da 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp
@@ -20,7 +20,7 @@ struct quantize_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::quantize_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
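The `make_deep_copy` helpers added in `multi_stage_primitive.hpp` and `primitive_base.hpp` above exist because a member-wise copy of an impl leaves the clone and the original pointing at the same kernel-params object, so one request's shape update mutates state another request is reading. A standalone illustration of that aliasing hazard follows; the types are hypothetical, and it assumes a shared-pointer-style handle, which is what the `.get()`-then-reassign pattern in the hunks suggests:

```cpp
#include <iostream>
#include <memory>

struct kernel_params { int lws = 0; };  // stand-in for the real params type

struct impl {
    std::shared_ptr<kernel_params> params = std::make_shared<kernel_params>();
};

int main() {
    impl original;

    impl shallow = original;  // member-wise copy: both objects share *params
    impl deep = original;
    deep.params = std::make_shared<kernel_params>(*original.params);  // own copy

    shallow.params->lws = 256;                  // also mutates original's view
    std::cout << original.params->lws << "\n";  // prints 256 -- the shared-state bug
    deep.params->lws = 64;                      // isolated; original stays at 256
    return 0;
}
```

Giving each clone its own params copy, as the helpers do, removes the shared mutable state entirely, which complements the mutex added in `mvn` for the remaining genuinely shared containers.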
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/random_uniform.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/random_uniform.cpp
index 3af560e3e2f270..f69a9f1dab9725 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/random_uniform.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/random_uniform.cpp
@@ -20,7 +20,7 @@ struct random_uniform_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::random_uniform_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp
index f6b142f1f13fa5..dcfdc766bf911c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp
@@ -20,7 +20,7 @@ struct range_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::range_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp
index d937d7fc8a190c..6ead31a1985bae 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp
@@ -69,7 +69,7 @@ struct reduce_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::reduce_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/region_yolo.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/region_yolo.cpp
index 98e1349b990f9a..e0d7c910fc8cc9 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/region_yolo.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/region_yolo.cpp
@@ -20,7 +20,7 @@ struct region_yolo_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::region_yolo_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp
index 4ccbcbe9549347..02145301734779 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp
@@ -22,7 +22,7 @@ struct reorder_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::reorder_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorg_yolo.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorg_yolo.cpp
index 6aa2b6395b69c3..ab1e2bce6ad24d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorg_yolo.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorg_yolo.cpp
@@ -20,7 +20,7 @@ struct reorg_yolo_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::reorg_yolo_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp
index 1572b939132568..d1ef25dce6264a 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp
@@ -137,7 +137,7 @@ struct resample_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::resample_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reshape.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reshape.cpp
index aa20d659e9179a..bac815666aa445 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reshape.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reshape.cpp
@@ -20,7 +20,7 @@ struct reshape_impl : public typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::reshape_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reverse.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reverse.cpp
index aee736b6871299..745c2912170bc6 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reverse.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reverse.cpp
@@ -20,7 +20,7 @@ struct reverse_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::reverse_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reverse_sequence.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reverse_sequence.cpp
index 03c555487e311c..7845d12a83687c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reverse_sequence.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reverse_sequence.cpp
@@ -19,7 +19,7 @@ struct reverse_sequence_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::reverse_sequence_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp
index f862a4c69e0b2d..209a411fbdb640 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp
@@ -20,7 +20,7 @@ struct rms_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::rms_impl);
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp
index f3865313fb020a..748a66fce8295d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp
@@ -44,7 +44,7 @@ struct roi_align_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::roi_align_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
    }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/roi_pooling.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/roi_pooling.cpp
index d898c70de82a01..a273496c5a878d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/roi_pooling.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/roi_pooling.cpp
@@ -40,7 +40,7 @@ struct roi_pooling_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::roi_pooling_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
 protected:
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/roll.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/roll.cpp
index 85cff366ac18bf..f7e01d060f2eae 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/roll.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/roll.cpp
@@ -20,7 +20,7 @@ struct roll_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::roll_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp
index 1221de7f67b323..27ce085ab83c3f 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp
@@ -20,7 +20,7 @@ struct rope_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::rope_impl);
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
index f4791d38f88742..895fd86bb01e5f 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
@@ -29,7 +29,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive
 
     clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     scaled_dot_product_attention_impl() = default;
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_elements_update.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_elements_update.cpp
index 47d35bf21b5fdb..220900671b5d46 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_elements_update.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_elements_update.cpp
@@ -68,7 +68,7 @@ struct scatter_elements_update_impl : typed_primitive_impl_ocl
 
     clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp
index 9d02993aef9495..a97535f7fcf24d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp
@@ -20,7 +20,7 @@ struct scatter_nd_update_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::scatter_nd_update_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp
index cc9eca7f284643..c99e25d3ab6174 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp
@@ -46,7 +46,7 @@ struct scatter_update_impl : typed_primitive_impl_ocl {
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::scatter_update_impl)
 
     std::unique_ptr clone() const override {
-        return make_unique(*this);
+        return make_deep_copy(*this);
     }
 
     void load(BinaryInputBuffer& ib) override {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp
index d84ff02b1c44d4..934bba2eca3b90 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp
@@ -20,7 +20,7 @@ struct select_impl : typed_primitive_impl_ocl {
         const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();
 
         if (!pass_through_events) {
-            for (auto e : events) {
-                e->wait();
-            }
+            stream.wait_for_events(events);
         }
 
         auto params = instance.get_impl_params();
@@ -87,14 +86,10 @@ struct select_impl : public typed_primitive_impl