From d62effb86b50781efa24af18a8af77bef9bd11db Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 6 Dec 2024 13:24:22 +0800 Subject: [PATCH 01/23] [CPU] Optimize small batch case for PagedAttention (#27847) ### Details: - *Generate more work items to avoid thread imbalance* - *...* ### Tickets: - *[156347](https://jira.devtools.intel.com/browse/CVS-156347)* - *[158477](https://jira.devtools.intel.com/browse/CVS-158477)* --- .../nodes/kernels/scaled_attn/executor_pa.cpp | 105 ++++++++++++++---- 1 file changed, 86 insertions(+), 19 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index bef34881ca41bc..90167ac86a8e1a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -939,14 +939,14 @@ struct MHAHelper { // wv_scratch_b: [rnd_up(kv_len, block_size), Hk, scratch_b_size] void exec_kernel_multiple(const PlainTensor& query, const PlainTensor& present_value, const PlainTensor& output_emb, const PlainTensor& qk_scratch_b, const PlainTensor& wv_scratch_b, const int32_t* block_table, size_t ithr, size_t q_blk, - size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + size_t hq_beg, size_t hq_end, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { auto q_start = q_blk * _block_size; auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { auto* q_ptr = query.ptr(h, q_start, 0); float* c_ptr = _weight.ptr(ithr, h, 0, 0); // for each query block, loop through all key block @@ -1065,13 +1065,14 @@ struct MHAHelper { // weight: [nthr, H, 32, rnd_up(kv_len, block_size)] // output: [nthr, 32, H, S] void exec_kernel_one_bh(const PlainTensor& query, const PlainTensor& present_key, const PlainTensor& present_value, const PlainTensor& output_emb, - const int32_t* block_table, size_t ithr, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + const int32_t* block_table, size_t ithr, size_t hq_beg, size_t hq_end, size_t hk, + size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk); } @@ -1082,7 +1083,7 @@ struct MHAHelper { for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(h, pq), present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk, 
_S, std::min(_block_size, cur_kv_len - pk)); } @@ -1091,7 +1092,7 @@ struct MHAHelper { } for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { // apply attention mask & sofmax float* alibi_lookup = nullptr; float alibi_slope = 0.f; @@ -1122,7 +1123,7 @@ struct MHAHelper { auto block_number = block_table[i]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output.ptr(ithr, pq, h), _weight.ptr(ithr, h, pq) + pv, v, @@ -1133,7 +1134,7 @@ struct MHAHelper { } // convert to dst for (size_t pq = 0; pq < q_len; pq++) - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) + for (size_t h = hq_beg; h < hq_end; h++) cvt_copy(output_emb.ptr(pq, h * _SV), _output.ptr(ithr, pq, h), _SV); } @@ -1162,8 +1163,38 @@ struct MHAHelper { // aligned to cache line (64bytes=16*sizeof(float)) to avoid false sharing _weight_bhl.resize({B, _H, q_len, rnd_up(max_context_len, std::max(_block_size, size_t{16}))}); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pk_in_blocks, size_t hk) { + // for small batches dynamic scheduler has notable overhead + bool prefer_static_loop; + // if less than 2 work items per thread, loop H + bool loop_hk = B * kv_len_in_blocks * _Hk <= 2 * _nthr ? false : true; + if (B <= 32) { + prefer_static_loop = true; + // small batch and all batch size is same(like SDPA case) + auto kv_len = past_lens.ptr()[0]; + for (size_t b = 1; b < B; b++) { + if (past_lens.ptr()[b] != kv_len) + prefer_static_loop = false; + } + } else { + // for bigger batch skip the test to save the cost + prefer_static_loop = false; + } + auto get_h_params = [] (bool loop_hk, size_t hx, size_t h_each_group_len, size_t& hq_beg, size_t& hq_end, size_t& hk) { + if (loop_hk) { + hk = hx; + hq_beg = hk * h_each_group_len; + hq_end = (hk + 1) * h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / h_each_group_len; + } + }; + auto loop_qk = [&](size_t b, size_t pk_in_blocks, size_t hx) { auto context_len = static_cast(past_lens.ptr()[b]) + 1; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid auto pk = pk_in_blocks * _block_size; if (pk < context_len) { @@ -1171,7 +1202,7 @@ struct MHAHelper { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk); } @@ -1179,16 +1210,16 @@ struct MHAHelper { _gemv->tile_release(); } else { for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(b, h, pq), present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk, _S, std::min(_block_size, context_len - pk)); } } } } - }); + }; - parallel_for3d_dynamic(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { + auto loop_softmax = [&](size_t b, size_t h, size_t pq) { auto cur_kv_len = static_cast(past_lens.ptr()[b]) + 1; auto ncausal = cur_kv_len; // apply attention 
mask & sofmax @@ -1210,7 +1241,16 @@ struct MHAHelper { ov::element::f32, ov::element::f32, alibi_slope); - }); + }; + + size_t h_dims = loop_hk ? _Hk : _H; + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d(B, _H, q_len, loop_softmax); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d_dynamic(B, _H, q_len, loop_softmax); + } if (output_score) { parallel_for2d_dynamic(B, q_len, [&](size_t b, size_t pq) { @@ -1229,16 +1269,19 @@ struct MHAHelper { memset(_output_bhl.ptr(ithr, 0, 0, 0, 0), 0, _output_bhl.stride(0) * sizeof(float)); }); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pv_in_blocks, size_t hk) { + auto loop_wk = [&](size_t b, size_t pv_in_blocks, size_t hx) { auto ithr = parallel_get_thread_num(); auto context_len = static_cast(past_lens.ptr()[b]) + 1; auto pv = pv_in_blocks * _block_size; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid if (pv < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pv_in_blocks]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output_bhl.ptr(ithr, b, pq, h), _weight_bhl.ptr(b, h, pq) + pv, v, @@ -1247,7 +1290,13 @@ struct MHAHelper { } } } - }); + }; + + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } parallel_for3d(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { auto* temp = _output_bhl.ptr(0, b, pq, h); @@ -1416,7 +1465,23 @@ struct MHA { } }); - parallel_for2d_dynamic(attn_work_count, Hk, [&](size_t w, size_t hk) { + // loop along HK dimension: if mixed first/second token and elements count is enough, loop HK to reuse KV in the CPU cache + // else if elements count is small, prefer to loop H to get more work to avoid thread imbalance + bool loop_hk = _workitems.get_reorder_max_batch_size() == past_lens.m_dims[0] || // if only first token, loop H + attn_work_count * Hk <= 2 * _helper._nthr ? false : true; // or less than 2 work items per thread, loop H + + parallel_for2d_dynamic(attn_work_count, loop_hk ? 
Hk : _helper._H, [&](size_t w, size_t hx) { + size_t hk, hq_beg, hq_end; + if (loop_hk) { + hk = hx; + hq_beg = hk * _helper._h_each_group_len; + hq_end = (hk + 1) * _helper._h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / _helper._h_each_group_len; + } + const auto& item = _workitems.get_attn_work_item(w); const auto batch_in_seq = item.batch_in_seq; const auto batch_in_token = subsequence_begins.ptr()[batch_in_seq]; @@ -1434,7 +1499,7 @@ struct MHA { _helper.exec_kernel_one_bh(q.slice(0, batch_in_token, batch_in_token), k_cache, v_cache, output_emb.slice(0, batch_in_token, batch_in_token), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], - ithr, hk, 1ul, cur_kv_len, alibi_slopes, + ithr, hq_beg, hq_end, hk, 1ul, cur_kv_len, alibi_slopes, score_output); } else { const auto batch_in_reorder = item.batch_in_reorder; @@ -1461,6 +1526,8 @@ struct MHA { block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], ithr, q_blk, + hq_beg, + hq_end, hk, q_len, cur_kv_len, From 0dd7434e299bbb2e85a936ab42e9a8bc40729f75 Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Fri, 6 Dec 2024 06:43:59 +0100 Subject: [PATCH 02/23] [CPU][Refactoring] Introduce VariableExecutor (#27883) Depending on the parameters a `FullyConnected` node can use one or multiple executors. With the current approach, even when just a single executor is used, every `prepareParams()` (executor::update()) call goes through executor selection routine. The idea is to avoid such `prepareParams()` overhead for a single executor scenarious, which are probably the most common ones. Thus, split the pipeline input two branches: - only single simple executor is used and updated - a `VariableExecutor` is used and updated. `VariableExecutor` contains two or more simple executors --- .../executors/dnnl/dnnl_fullyconnected.hpp | 4 +- .../src/nodes/executors/executor_factory.hpp | 201 ++++-------------- .../fullyconnected_implementations.cpp | 1 + .../src/nodes/executors/graph_emitter.hpp | 46 +++- .../src/nodes/executors/variable_executor.hpp | 140 ++++++++++++ .../intel_cpu/src/nodes/fullyconnected.cpp | 18 +- .../intel_cpu/src/nodes/fullyconnected.h | 4 +- 7 files changed, 232 insertions(+), 182 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp index 3266bf8965c37b..1d078feaa6549b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp @@ -8,12 +8,12 @@ #include #include "cpu_memory.h" -#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" -#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" #include "nodes/executors/dnnl/dnnl_aliases.hpp" +#include "nodes/executors/dnnl/dnnl_utils.hpp" #include "nodes/executors/executor.hpp" #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/memory_arguments.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index 419ab4abf52cd7..f12795d5d1eb16 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2022 Intel Corporation +// Copyright 
(C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,50 +6,22 @@ #include #include -#include #include "executor.hpp" -#include "nodes/executors/implementations.hpp" #include "nodes/executors/executor_config.hpp" #include "nodes/executors/executor_implementation.hpp" #include "nodes/executors/graph_emitter.hpp" +#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/printers.hpp" -#include "openvino/core/except.hpp" +#include "nodes/executors/variable_executor.hpp" #include "post_ops.hpp" namespace ov { namespace intel_cpu { using namespace executor; -template -static ExecutorPtr fallback(const executor::Config& config, - const executor::Config& fallbackConfig, - const MemoryArgs& memory, - const ExecutorContext::CPtr context, - const std::string& name) { - DEBUG_LOG("Falling back to graph executor for ", - name, - ". Original config: ", - config, - " new config:", - fallbackConfig); - - GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); - - const auto& graphExecutor = - graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) - .ensureAttrsMatch() - .ensureSrcDescsMatch() - .ensureDstDescsMatch() - .ensurePostOpsMatch() - .emit(); - (void)graphExecutor; - - OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; -} - -template +template class ExecutorFactory { public: using ExecutorImplementationRef = std::reference_wrapper>; @@ -62,9 +34,7 @@ class ExecutorFactory { : m_attrs(attrs), m_postOps(postOps), m_context(context), - m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)), - m_implementationRequiresFallback(m_suitableImplementations.size(), true), - m_executors(m_suitableImplementations.size()) {} + m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)) {} /** * @brief Retrieves the proper memory descriptors based on the provided memory descriptors. @@ -95,104 +65,42 @@ class ExecutorFactory { } /** - * @brief Preconfigures an executor based on the provided memory arguments. - * - * Preconfigures an executor by selecting an appropriate implementation based on the provided - * memory arguments and by creating an executor using the implementation. - * - * @param memory The memory parameters used for selecting the appropriate executor implementation. - * - * @note The main use case is to offload executor data preparation (i.e. weights packing) - * From the make() call - * @todo Currently supports creating a single executor. - * For some nodes it can be worth to preconfigure all the executors. - */ - void preconfigure(const MemoryArgs& memory) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - - cacheFallbackStatus(config); - - const size_t implId = select(memory, 0); - const auto& impl = m_suitableImplementations[implId].get(); - DEBUG_LOG("Preconfiguring executor: ", impl.name()); - - if (m_implementationRequiresFallback[implId]) { - if (auto fallbackConfig = impl.requiresFallback(config)) { - fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - - (void)create(implId, memory, m_context); - } - - /** - * @brief Creates an Executor instance based on provided memory arguments. + * @brief Creates an Executor instance based on the provided memory arguments. 
* - * Creates an Executor instance using the provided MemoryArgs, selecting an appropriate implementation - * based on the characteristics of the memory. It handles fallback scenarios if necessary and updates the executor - * with the given memory information. + * Depending on the number of available implementations, returns: + * - VariableExecutor, if the number of implementations is two or more + * - Simple Executor, if there is only one available implementation * * @param memory memory arguments. * * @return A shared pointer to the created Executor. - * - * The function follows the steps below: - * - Selects an implementation based on the provided memory using the select() function. - * - Retrieves the selected implementation and checks if fallback is required. - * - If fallback is required, it creates a fallback configuration and returns a fallback executor. - * - Otherwise creates the executor using the selected implementation. - * - Updates the executor with the given memory information. - * */ - ExecutorPtr make(MemoryArgs& memory) { - auto createExec = [this](MemoryArgs& memory, size_t implId) -> ExecutorPtr { - const auto& impl = m_suitableImplementations[implId].get(); - if (m_implementationRequiresFallback[implId]) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - if (auto fallbackConfig = impl.requiresFallback(config)) { - return fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - const auto executor = create(implId, memory, m_context); - if (!executor->update(memory)) { - return nullptr; + ExecutorPtr make(const MemoryArgs& memory) { + // only single executor is available + if (m_suitableImplementations.size() == 1) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + + const auto& theOnlyImplementation = m_suitableImplementations.front().get(); + + if (const auto fallbackConfig = theOnlyImplementation.requiresFallback(config)) { + return GraphEmitter::fallback(config, + *fallbackConfig, + memory, + m_context, + theOnlyImplementation.name()); } - return executor; - }; - - auto implId = select(memory, 0); - auto executor = createExec(memory, implId); - while (!executor) { - implId = select(memory, ++implId); - executor = createExec(memory, implId); - } - return executor; - } -private: - static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { - MemoryDescArgs memoryDescs; - memoryDescs.reserve(memory.size()); - - for (const auto& mem : memory) { - memoryDescs[mem.first] = mem.second->getDescPtr(); + return theOnlyImplementation.create(m_attrs, m_postOps, memory, m_context); } - return memoryDescs; - } - - /** - * @brief Caches the fallback status for each suitable implementation. - */ - void cacheFallbackStatus(const executor::Config& config) { - std::transform(m_suitableImplementations.begin(), - m_suitableImplementations.end(), - m_implementationRequiresFallback.begin(), - [&config](const ExecutorImplementationRef& impl) { - return impl.get().requiresFallback(config); - }); + return std::make_shared>(memory, + m_attrs, + m_postOps, + m_context, + m_suitableImplementations); } +private: /** * @brief Filters and retrieves suitable implementations based on the provided executor configuration. * @@ -205,11 +113,10 @@ class ExecutorFactory { * @note If an implementation is shape agnostic, no further implementations with lower * priority are considered. 
*/ - static std::vector filter( - const Attrs& attrs, - const PostOps& postOps, - const MemoryDescArgs& descs, - const std::string& implementationPriority = {}) { + static std::vector filter(const Attrs& attrs, + const PostOps& postOps, + const MemoryDescArgs& descs, + const std::string& implementationPriority = {}) { const auto& implementations = getImplementations(); std::vector suitableImplementations; const executor::Config config{descs, attrs, postOps}; @@ -244,51 +151,17 @@ class ExecutorFactory { return suitableImplementations; } - size_t select(const MemoryArgs& memory, const size_t startIdx) const { - OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), - "Failed to find an implementation since start indx: ", startIdx, - " is out of range of the suitable implementations array: ", m_suitableImplementations.size()); - auto startIt = m_suitableImplementations.begin(); - std::advance(startIt, startIdx); - const auto selectedImplementation = - std::find_if(startIt, - m_suitableImplementations.end(), - [&memory](const ExecutorImplementationRef& implementation) { - return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); - }); - OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); - - return std::distance(m_suitableImplementations.begin(), selectedImplementation); - } - - ExecutorPtr create(const size_t implId, - const MemoryArgs& memory, - const ExecutorContext::CPtr context) { - assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); - - if (!m_executors[implId]) { - const auto& impl = m_suitableImplementations[implId].get(); - m_executors[implId] = impl.create(m_attrs, m_postOps, memory, context); - } - - return m_executors[implId]; - } - const Attrs& m_attrs; const PostOps& m_postOps; const ExecutorContext::CPtr m_context; std::vector m_suitableImplementations; - // stores fallback status to avoid performing the check for every make() call - std::vector m_implementationRequiresFallback; - // executors cache - std::vector m_executors; }; -template -using ExecutorFactoryPtr = std::shared_ptr>; +template +using ExecutorFactoryPtr = std::shared_ptr>; -template -using ExecutorFactoryCPtr = std::shared_ptr>; +template +using ExecutorFactoryCPtr = std::shared_ptr>; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 5834c3dda4b262..4cf6992985ecd3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -11,6 +11,7 @@ #include "memory_desc/cpu_memory_desc.h" #include "nodes/executors/convolution_config.hpp" #include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" +#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" #include "nodes/executors/dnnl/dnnl_fullyconnected.hpp" #include "nodes/executors/dnnl/dnnl_matmul_primitive.hpp" #include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp index 6aad18c793c8cf..784ed8bc778840 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp @@ -5,12 +5,11 @@ #pragma once #include -#include #include "graph.h" 
-#include "memory_desc/cpu_memory_desc.h" #include "node.h" #include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_config.hpp" #include "post_ops.hpp" namespace ov { @@ -72,6 +71,49 @@ class GraphEmitter { return graph; } + static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { + MemoryDescArgs memoryDescs; + memoryDescs.reserve(memory.size()); + + for (const auto& mem : memory) { + memoryDescs[mem.first] = mem.second->getDescPtr(); + } + + return memoryDescs; + } + + static executor::Config createConfig(const MemoryArgs& memory, + const Attrs& attrs, + const PostOps& postOps) { + return executor::Config{memoryDescsFromMemory(memory), attrs, postOps}; + } + + static ExecutorPtr fallback(const executor::Config& config, + const executor::Config& fallbackConfig, + const MemoryArgs& memory, + const ExecutorContext::CPtr context, + const std::string& name) { + DEBUG_LOG("Falling back to graph executor for ", + name, + ". Original config: ", + config, + " new config:", + fallbackConfig); + + GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); + + const auto& graphExecutor = + graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) + .ensureAttrsMatch() + .ensureSrcDescsMatch() + .ensureDstDescsMatch() + .ensurePostOpsMatch() + .emit(); + (void)graphExecutor; + + OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; + } + private: const MemoryDescArgs& descs; const Attrs& attrs; diff --git a/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp new file mode 100644 index 00000000000000..8dfb7a4c63fde4 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor.hpp" +#include "executor_config.hpp" +#include "executor_implementation.hpp" +#include "nodes/executors/graph_emitter.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * A stateful (variable) executor + * Contains two or more executors. 
+ * Switches between the executors based on provided Memory (more precisely based on in / out shapes) + */ +template +class VariableExecutor : public Executor { +public: + using ExecutorImplementationRef = std::reference_wrapper>; + + VariableExecutor(const MemoryArgs& memory, + const Attrs& attrs, + const PostOps& postOps, + const ExecutorContext::CPtr context, + std::vector suitableImplementations) + : m_attrs(attrs), + m_postOps(postOps), + m_context(context), + m_suitableImplementations(std::move(suitableImplementations)), + m_implementationRequiresFallback( + cacheFallbackStatus(m_suitableImplementations, + GraphEmitter::createConfig(memory, m_attrs, m_postOps))), + m_executors(m_suitableImplementations.size()) { + const size_t implId = select(memory, 0); + m_executors[implId] = create(implId, memory); + m_implId = implId; + } + + bool update(const MemoryArgs& memory) override { + for (auto implId = select(memory, 0); implId < m_suitableImplementations.size(); + implId = select(memory, implId)) { + if (!m_executors[implId]) { + m_executors[implId] = create(implId, memory); + } + + if (m_executors[implId]->update(memory)) { + m_implId = implId; + return true; + } + } + + return false; + } + + void execute(const MemoryArgs& memory) override { + m_executors[m_implId]->execute(memory); + } + + impl_desc_type implType() const override { + return m_executors[m_implId]->implType(); + } + + void moveMemToNumaNode(int numaID) override { + m_executors[m_implId]->moveMemToNumaNode(numaID); + } + +private: + /** + * @brief Returns a fallback status for each suitable implementation. + */ + static std::vector cacheFallbackStatus(const std::vector& suitableImplementations, + const executor::Config& config) { + std::vector implementationRequiresFallback(suitableImplementations.size()); + std::transform(suitableImplementations.begin(), + suitableImplementations.end(), + implementationRequiresFallback.begin(), + [&config](const ExecutorImplementationRef& impl) { + return impl.get().requiresFallback(config); + }); + + return implementationRequiresFallback; + } + + size_t select(const MemoryArgs& memory, const size_t startIdx) const { + OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), + "Failed to find an implementation since start indx: ", + startIdx, + " is out of range of the suitable implementations array: ", + m_suitableImplementations.size()); + + auto startIt = m_suitableImplementations.begin() + startIdx; + + const auto selectedImplementation = + std::find_if(startIt, + m_suitableImplementations.end(), + [&memory](const ExecutorImplementationRef& implementation) { + return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); + }); + + OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); + + return std::distance(m_suitableImplementations.begin(), selectedImplementation); + } + + ExecutorPtr create(const size_t implId, const MemoryArgs& memory) { + assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); + + auto createWithFallback = [this](const size_t implId, const MemoryArgs& memory) { + const auto& impl = m_suitableImplementations[implId].get(); + + if (m_implementationRequiresFallback[implId]) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + if (auto fallbackConfig = impl.requiresFallback(config)) { + return GraphEmitter::fallback(config, *fallbackConfig, memory, m_context, impl.name()); + } + } + + return impl.create(m_attrs, m_postOps, 
memory, m_context); + }; + + return createWithFallback(implId, memory); + } + + const Attrs& m_attrs; + const PostOps& m_postOps; + const ExecutorContext::CPtr m_context; + std::vector m_suitableImplementations; + // stores fallback status to avoid performing the check for every make() call + std::vector m_implementationRequiresFallback; + // executors cache + std::vector m_executors; + size_t m_implId; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 307125ef0069e0..31ae4f26cc08a1 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -66,7 +66,7 @@ void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) // init tp_cfg.w_rank and tp_cfg.w_size tp_cfg.w_rank = context->getCPUStreamExecutor()->get_rank()[0]; tp_cfg.w_size = ov::threading::message_manager()->get_num_sub_streams(); - tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1 ? true : false; + tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1; tp_cfg.sub_memory = context->getSubMemory(); } } @@ -119,16 +119,12 @@ void FullyConnected::needPrepareParamsForTensorParallel() { } } -ExecutorPtr FullyConnected::createExecutor() { - const auto& executor = factory->make(memory); - getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); - - return executor; -} - void FullyConnected::prepareParams() { needPrepareParamsForTensorParallel(); - executor = createExecutor(); + + executor->update(memory); + // @todo avoid updating implementation type in scope of every prepareParams call + getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); } void FullyConnected::initTensorParallelSync() { @@ -431,7 +427,7 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { needUpdateZeroPointForTensorParallel(); auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); - factory = std::make_shared>(attrs, postOps, executionContext, descs); + factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; @@ -496,7 +492,7 @@ void FullyConnected::createPrimitive() { needSplitMemoryForTensorParallel(); // @todo should we preconfigure only for dynamic shapes? 
// Since for static shapes primitive is created in scope of compile_model() anyway - factory->preconfigure(memory); + executor = factory->make(memory); Node::createPrimitive(); } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index be29342b851988..8c17228e365af4 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -16,7 +16,6 @@ #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "post_ops.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" namespace ov { namespace intel_cpu { @@ -85,7 +84,6 @@ class FullyConnected : public Node { static const size_t WEIGHTS_ID = 1; static const size_t BIAS_ID = 2; - ExecutorPtr createExecutor(); void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); void initTensorParallelConfig(const GraphContext::CPtr context); @@ -103,7 +101,7 @@ class FullyConnected : public Node { FCAttrs attrs; PostOps postOps; MemoryArgs memory; - ExecutorFactoryPtr factory; + ExecutorFactoryPtr factory; ExecutorPtr executor = nullptr; std::string errorPrefix; From fb1810b8ce36f7d8e7be26a0d5e71444f8c8f047 Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Fri, 6 Dec 2024 06:52:36 +0100 Subject: [PATCH 03/23] [tests/llm] Reorder imports to avoid onnx-related DDL load fail (#27942) ### Details: There is an issue with ONNX>=1.17 which causes DLL load failures on Windows. Previously it caused WWB import to fail (CVS-158774), it was fixed in https://github.com/openvinotoolkit/openvino.genai/pull/1301. Now this llm tests failure comes from the next import, optimum.intel.openvino, and it doesn't reproduce locally if optimum.intel.openvino is imported before WWB. Signed-off-by: Alina Kladieva --- tests/llm/accuracy_conformance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm/accuracy_conformance.py b/tests/llm/accuracy_conformance.py index 41015d7664ecc2..7f75a8e912bbd6 100644 --- a/tests/llm/accuracy_conformance.py +++ b/tests/llm/accuracy_conformance.py @@ -5,9 +5,9 @@ import tempfile import pytest -import whowhatbench as wwb from optimum.intel.openvino import (OVModelForCausalLM, OVWeightQuantizationConfig) +import whowhatbench as wwb from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed logging.basicConfig(level=logging.INFO) From 536bd69ed66a57869aa6d3bbe06692217997e67e Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 6 Dec 2024 11:50:30 +0400 Subject: [PATCH 04/23] [GPU] Parse runtime_options from model RT info and apply to config (#27900) ### Details: - Added conversion logic from RT Info attributes to plugin property for limited set of properties. 
Signed-off-by: Vladimir Paramuzov --- .../intel_gpu/runtime/execution_config.hpp | 14 +++ src/plugins/intel_gpu/src/plugin/plugin.cpp | 4 + .../src/runtime/execution_config.cpp | 6 ++ .../tests/functional/behavior/properties.cpp | 99 +++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 src/plugins/intel_gpu/tests/functional/behavior/properties.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp index 0af98bf1e952d0..3e854e4c9c5ada 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp @@ -138,6 +138,10 @@ class ExecutionConfig { void apply_user_properties(const cldnn::device_info& info); + // Note that RT info property value has lower priority than values set by user via core.set_property or passed to compile_model call + // So this method should be called after setting all user properties, but before apply_user_properties() call. + void apply_rt_info(const ov::RTMap& rt_info); + std::string to_string() const; protected: @@ -147,6 +151,16 @@ class ExecutionConfig { void apply_priority_hints(const cldnn::device_info& info); void apply_debug_options(const cldnn::device_info& info); + template + void apply_rt_info_property(const ov::Property& property, const ov::RTMap& rt_info) { + if (!is_set_by_user(property)) { + auto rt_info_val = rt_info.find(property.name()); + if (rt_info_val != rt_info.end()) { + set_user_property(property(rt_info_val->second.template as())); + } + } + } + private: ov::AnyMap internal_properties; ov::AnyMap user_properties; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 7775a153a99e8f..c8839472a6d962 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -189,6 +189,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< ExecutionConfig config = m_configs_map.at(device_id); config.set_user_property(orig_config); + if (model->has_rt_info("runtime_options")) + config.apply_rt_info(model->get_rt_info("runtime_options")); config.apply_user_properties(context->get_engine().get_device_info()); set_cache_info(model, config); @@ -278,6 +280,8 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& ExecutionConfig config = m_configs_map.at(device_id); config.set_user_property(orig_config); + if (model->has_rt_info("runtime_options")) + config.apply_rt_info(model->get_rt_info("runtime_options")); config.apply_user_properties(ctx->get_engine().get_device_info()); ProgramBuilder prog(ctx->get_engine(), config); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 4eaccf5540bd2a..30a9477e1600dd 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -257,6 +257,12 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { user_properties.clear(); } +void ExecutionConfig::apply_rt_info(const ov::RTMap& rt_info) { + apply_rt_info_property(ov::hint::kv_cache_precision, rt_info); + apply_rt_info_property(ov::hint::dynamic_quantization_group_size, rt_info); + apply_rt_info_property(ov::hint::activations_scale_factor, rt_info); +} + std::string ExecutionConfig::to_string() const { std::stringstream s; s << "internal properties:\n"; diff --git 
a/src/plugins/intel_gpu/tests/functional/behavior/properties.cpp b/src/plugins/intel_gpu/tests/functional/behavior/properties.cpp new file mode 100644 index 00000000000000..93a00262db35c2 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/behavior/properties.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/properties.hpp" +#include "base/ov_behavior_test_utils.hpp" +#include "openvino/runtime/core.hpp" +#include "common_test_utils/subgraph_builders/conv_pool_relu.hpp" + +namespace { + +class TestPropertiesGPU : public ::testing::Test { +public: + std::shared_ptr model; + + void SetUp() override { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + model = ov::test::utils::make_conv_pool_relu(); + } +}; + +TEST_F(TestPropertiesGPU, NoRTInfo) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); +} + +TEST_F(TestPropertiesGPU, RTInfoPropertiesWithDefault) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name()); + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); + ASSERT_EQ(type.as(), ov::element::f16); + ASSERT_EQ(size.as(), 0); + ASSERT_EQ(scale.as(), 8.0f); +} + +TEST_F(TestPropertiesGPU, RTInfoPropertiesWithUserValuesFromCore) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name()); + core.set_property(ov::hint::kv_cache_precision(ov::element::u8)); + core.set_property(ov::hint::dynamic_quantization_group_size(16)); + core.set_property(ov::hint::activations_scale_factor(4.0f)); + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); + ASSERT_EQ(type.as(), ov::element::u8); + ASSERT_EQ(size.as(), 16); + ASSERT_EQ(scale.as(), 4.0f); +} + +TEST_F(TestPropertiesGPU, RTInfoPropertiesWithUserValuesFromCompileModel) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + 
model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name()); + ov::AnyMap config; + config[ov::hint::kv_cache_precision.name()] = "u8"; + config[ov::hint::dynamic_quantization_group_size.name()] = "16"; + config[ov::hint::activations_scale_factor.name()] = "4.0"; + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); + ASSERT_EQ(type.as(), ov::element::u8); + ASSERT_EQ(size.as(), 16); + ASSERT_EQ(scale.as(), 4.0f); +} + +} // namespace From eed4a60be67dbb22825a4fad20245ae806e11634 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Fri, 6 Dec 2024 10:15:28 +0100 Subject: [PATCH 05/23] [DOCS] test drive doc (#27933) A document for test drive and some minor tweaks in other areas --- .../documentation/openvino-ecosystem.rst | 9 ++ .../openvino-test-drive.rst | 109 ++++++++++++++++++ .../llm_inference_guide/genai-guide.rst | 2 +- .../benchmarks_files/llm_models_7-155H.csv | 1 + .../benchmarks_files/llm_models_7-258V.csv | 1 + .../benchmarks_files/llm_models_9-288V.csv | 3 +- .../_static/download/supported_models.csv | 1 - 7 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst diff --git a/docs/articles_en/documentation/openvino-ecosystem.rst b/docs/articles_en/documentation/openvino-ecosystem.rst index 6735192e95f674..fe4f203428a865 100644 --- a/docs/articles_en/documentation/openvino-ecosystem.rst +++ b/docs/articles_en/documentation/openvino-ecosystem.rst @@ -12,6 +12,7 @@ OpenVINO™ Ecosystem Overview :hidden: openvino-ecosystem/openvino-training-extensions + openvino-ecosystem/openvino-test-drive openvino-ecosystem/datumaro openvino-ecosystem/openvino-security-add-on @@ -102,6 +103,14 @@ development process, empowering teams to produce custom AI models at scale. |hr| +| **Intel® Test Drive** +| :bdg-link-dark:`Github ` + +OpenVINO™ Test Drive is cross-platform graphic user interface application that enables running +generative AI and vision models directly on your computer or edge device using OpenVINO™ Runtime. +|hr| + + | **Tokenizers** | :bdg-link-dark:`Github ` :bdg-link-success:`User Guide ` diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst new file mode 100644 index 00000000000000..527a01bf38a6cf --- /dev/null +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst @@ -0,0 +1,109 @@ +=============================================================================================== +OpenVINO™ Test Drive +=============================================================================================== + + +.. meta:: + :description: See how to test your models with OpenVINO, using a simple graphic interface of + Test Drive. + + + +OpenVINO™ Test Drive is a cross-platform graphic user interface application for running and +testing AI models, both generative and vision based. 
+It can run directly on your computer or on edge devices using +`OpenVINO™ Runtime `__. + +OpenVINO™ Test Drive is developed under the `openvino_testdrive repository `__. + +Use OpenVINO™ Test Drive to: + +* **Chat with LLMs** and evaluate model performance on your computer or edge device; +* **Experiment with different text prompts** to generate images, using Stable + Diffusion and Stable DiffusionXL models (coming soon); +* **Transcribe speech from video**, using Whisper models, including generation + of timestamps (coming soon); +* **Run inference of models** trained by Intel® Geti™ and **visualize the results**. + + + +Installation (Windows) +############################################################################################### + +1. Download the latest archive from the + `release repository `__. + To verify the integrity of the downloaded package, use the SHA-256 file attached. + +2. Extract the zip file and run the *MSIX* installation package. Click the `Install` button to + proceed. + +3. Launch OpenVINO™ Test Drive, clicking the application name in the Windows app list. + + +Quick start +############################################################################################### + +When starting the application, you can import an LLM model from Hugging Face Hub +or upload an Intel® Geti™ model from a local drive. + +Inference of models from Hugging Face ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Find a model on `Hugging Face `__ and import it. + +2. Chat with LLMs via the `Playground` tab. + +3. Use the `Performance metrics` tab to get model performance metrics on your + computer or an edge device. + + + +Inference of models trained with Intel® Geti™ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Download the deployment code for a model in the OpenVINO IR format trained + by Intel® Geti™ (refer to the `Intel® Geti™ documentation `__ + for more details). + +2. Import the deployment code into OpenVINO™ Test Drive, using the *Import model* and then + *Local disk* buttons. + +3. Use the *Live inference* tab to run and visualize results of inference of individual images. + +4. For batch inference, use the *Batch inference* tab and provide paths to the folder + with input images, as well as one for batch inference results. You can do so by filling out + the *Source folder* and *Destination folder* fields. Click *Start* to start batch inference. + + +Build the Application +############################################################################################### + +1. Make sure you `Install flutter SDK `__ + and all its platform-specific dependencies. +2. Build the bindings and place them in the **./bindings** folder. + + OpenVINO™ Test Drive uses bindings to `OpenVINO™ GenAI `__ + and `OpenVINO™ Model API `__, + which are located in the **./openvino_bindings** folder. Refer to the + `GitHub page `__ + for more details. + +3. Start the application, using the following command: + + .. code-block:: console + + flutter run + +Additional Resources +############################################################################################### + +- `OpenVINO™ `__ - a software toolkit + for optimizing and deploying deep learning models. +- `GenAI Repository `__ and + `OpenVINO Tokenizers `__ + - resources and tools for developing and optimizing Generative AI applications. +- `Intel® Geti™ `__ - software for building computer + vision models. 
+- `OpenVINO™ Model API `__ + - a set of wrapper classes for particular tasks and model architectures. + It simplifies routine procedures, preprocessing and postprocessing of data. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index 37b6091eb9b898..42c1c3fb47aa42 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -18,7 +18,7 @@ make sure to :doc:`install OpenVINO with GenAI <../../get-started/install-openvi .. image:: ../../assets/images/genai_main_diagram.svg :align: center - :alt: OpenVINO workflow diagram for convenience + :alt: OpenVINO GenAI workflow diagram | Here is sample code for several Generative AI use case scenarios. Note that these are very basic diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv index fa5ae359fa45c0..9481b5619244e2 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv @@ -1,3 +1,4 @@ +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec opt-125m-gptq,INT4-MIXED,32,1116,25.8,8.1,123.5 opt-125m-gptq,INT4-MIXED,1024,1187.1,75.2,8.2,122.0 qwen2-0.5b,INT4-MIXED,32,1587.4,45.1,15.4,64.9 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv index 9aa769e4dd61b9..625ff1d6fe5ed5 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv @@ -1,3 +1,4 @@ +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec opt-125m-gptq,INT4-MIXED,32,1150.2,35.1,8.2,122.0 opt-125m-gptq,INT4-MIXED,1024,1228,67,8.2,122.0 qwen2-0.5b,INT4-MIXED,1024,1596.2,83.6,14.4,69.4 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv index dfc98271bcd21b..c1932e678505ff 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv @@ -1,4 +1,5 @@ -opt-125m-gptq,INT4-MIXED,32,833.1,15.6,3.9,256.4 +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec +opt-125m-gptq,INT4-MIXED,32,833.1,15.6,3.9,256.4 opt-125m-gptq,INT4-MIXED,1024,955.9,553.8,4.8,208.3 bloomz-560m,INT4-MIXED,32,1457.5,48.5,11.1,90.1 qwen2-0.5b,INT4-MIXED,32,1167.8,95.7,11.5,87.0 diff --git a/docs/sphinx_setup/_static/download/supported_models.csv b/docs/sphinx_setup/_static/download/supported_models.csv index 87ea37b0f207c3..39053fa6d3e0a7 100644 --- a/docs/sphinx_setup/_static/download/supported_models.csv +++ b/docs/sphinx_setup/_static/download/supported_models.csv @@ -715,7 +715,6 @@ tiny-random-BeitForImageClassification,Image Classification,pytorch,intel-optimu tiny-random-bert,Natural Language Processing,pytorch,intel-optimum default,+,, tiny-random-BlenderbotModel,Large Language Model,pytorch,INT4,+,, tiny-random-BloomModel,Large Language Model,pytorch,INT4,+,, -tiny-random-chatglm2,Large Language Model,pytorch,INT4,+,, tiny-random-codegen2,Large Language Model,pytorch,INT4,+,, tiny-random-CodeGenForCausalLM,Large Language Model,pytorch,INT4,+,, 
tiny-random-CohereForCausalLM,Large Language Model,pytorch,INT4,+,, From 0f1e5092b518402248e372c2401651a0bd150f7f Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Fri, 6 Dec 2024 11:41:12 +0100 Subject: [PATCH 06/23] [DOCS] Remove OVMS Button (#27951) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- docs/articles_en/about-openvino/performance-benchmarks.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index 5d9abfe891584f..a398432925a983 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -56,7 +56,8 @@ implemented in your solutions. Click the buttons below to see the chosen benchma :material-regular:`table_view;1.4em` LLM performance for AI PC - .. grid-item:: +.. uncomment under + .. .. grid-item:: .. button-link:: # :class: ovms-toolkit-benchmark-llm-result From c3b014c49afa04a838e1778184cf97a1c834e465 Mon Sep 17 00:00:00 2001 From: Tomasz Jankowski Date: Fri, 6 Dec 2024 11:55:16 +0100 Subject: [PATCH 07/23] [Templ test] GroupNormalization: Enable whole Tensor comparison (#27932) ### Details: - Removed legacy comparison method. - Set relative threshold for fp16. ### Tickets: - CVS-137168 Signed-off-by: Tomasz Jankowski --- .../tests/functional/op_reference/group_normalization.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/template/tests/functional/op_reference/group_normalization.cpp b/src/plugins/template/tests/functional/op_reference/group_normalization.cpp index 322d509aa838ec..b3bd898db4eeec 100644 --- a/src/plugins/template/tests/functional/op_reference/group_normalization.cpp +++ b/src/plugins/template/tests/functional/op_reference/group_normalization.cpp @@ -42,11 +42,14 @@ class ReferenceGroupNormalization : public testing::TestWithParam& obj) { From bf62609711227605d381bedfcd993e6c60475975 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Fri, 6 Dec 2024 23:51:58 +0900 Subject: [PATCH 08/23] [GPU] MLP : 2fcs + swiglu fusion (#27831) ### Details: - 2 FCs + swiglu in MLP pattern are fused - Only applied to cldnn && #EUs > 128 && glu type with swiglu ### Tickets: - 152163 --- .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../include/intel_gpu/runtime/layout.hpp | 5 + .../intel_gpu/src/graph/fully_connected.cpp | 26 +++- .../prepare_primitive_fusing.cpp | 62 +++++++++- .../src/graph/impls/ocl/fully_connected.cpp | 14 ++- .../impls/ocl/kernel_selector_helper.cpp | 10 +- .../src/graph/include/pass_manager.h | 1 + .../intel_gpu/src/graph/include/swiglu_inst.h | 9 ++ .../intel_gpu/src/graph/primitive_inst.cpp | 11 ++ .../intel_gpu/src/graph/program_node.cpp | 22 ++++ .../fully_connected_gpu_bf_tiled.cl | 117 ++++++++++++++++-- .../fully_connected_gpu_bf_tiled_common.cl | 49 +++++++- .../fully_connected_kernel_bf_tiled.cpp | 65 +++++++--- .../fully_connected_kernel_bf_tiled.h | 3 +- .../kernels/swiglu/swiglu_kernel_base.h | 11 ++ .../intel_gpu/src/plugin/ops/swiglu.cpp | 4 +- .../transformations/fc_horizontal_fusion.cpp | 19 ++- .../transformations/fc_horizontal_fusion.hpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 10 +- .../src/runtime/debug_configuration.cpp | 5 +- .../fusions/fully_connected_fusion_test.cpp | 59 ++++++++- .../tests/unit/fusions/fusion_test_common.hpp | 12 ++ 22 files changed, 469 insertions(+), 48 deletions(-) diff --git 
a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index a020c5d1cd5ef6..a7a8ae1f229a72 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -147,6 +147,7 @@ class debug_configuration { int use_kv_cache_compression; // Enable KV-cache compression int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion + int disable_fc_swiglu_fusion; // Disable swiglu fusion to fc std::set dump_iteration; // Dump n-th execution of network. std::vector load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp index ab5cb53454b768..cc753d10aea9cd 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp @@ -50,6 +50,11 @@ struct data_type_traits { return et.is_quantized() && et.bitwidth() == 8; } + static bool is_i4_u4(data_types data_type) { + auto et = ov::element::Type(data_type); + return et.bitwidth() == 4; + } + static ov::element::Type max_type(ov::element::Type t1, ov::element::Type t2) { if (t1.bitwidth() < t2.bitwidth()) return t2; diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index bc1e3e2e82b3ca..308d9a9f2fd66b 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -7,8 +7,10 @@ #include #include #include "utils.hpp" +#include "swiglu_inst.h" #include "matmul_shape_inference.hpp" +#include "glu_shape_inference.hpp" namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(fully_connected) @@ -171,14 +173,32 @@ std::vector fully_connected_inst::calc_output_layouts(fully_connected_no output_type = impl_param.get_output_element_type(); } - ov::op::v0::MatMul op; - op.set_transpose_b(true); + ov::op::v0::MatMul matmul_op; + matmul_op.set_transpose_b(true); std::vector input_shapes = { input_layout.get(), weights_layout.get() }; - std::vector output_shapes = ov::op::v0::shape_infer(&op, input_shapes); + std::vector output_shapes = ov::op::v0::shape_infer(&matmul_op, input_shapes); + bool has_swiglu = false; + auto& fused_prims = node.get_fused_primitives(); + for (auto f : fused_prims) { + if (f.is_type()) { + has_swiglu = true; + OPENVINO_ASSERT(fused_prims.size() == 1, "Other operation is fused in addition to swiglu!"); + } + } + if (has_swiglu) { + ov::op::internal::GLU swiglu_op; + OPENVINO_ASSERT(fused_prims.size() == 1); + OPENVINO_ASSERT(fused_prims[0].typed_desc()->glu_type == ov::op::internal::GLU::GluType::Swish); + swiglu_op.set_axis(fused_prims[0].typed_desc()->axis); + swiglu_op.set_split_lengths(fused_prims[0].typed_desc()->split_lengths); + swiglu_op.set_glu_type(fused_prims[0].typed_desc()->glu_type); + std::vector input_shapes = { output_shapes[0] }; + output_shapes = shape_infer(&swiglu_op, input_shapes); + } bool is_static = input_layout.is_static() && weights_layout.is_static(); bool allow_new_shape_infer = impl_param.get_program().is_new_shape_infer(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 60d1e8aa7e10b7..29b7cf58a19b54 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // - +#include "intel_gpu/runtime/debug_configuration.hpp" #include "program_helpers.h" #include "pass_manager.h" @@ -37,6 +37,7 @@ #include "strided_slice_inst.h" #include "cum_sum_inst.h" #include "embedding_bag_inst.h" +#include "swiglu_inst.h" #include "extract_image_patches_inst.h" #include "reduce_inst.h" #include "group_normalization_inst.h" @@ -56,6 +57,7 @@ using namespace cldnn; void prepare_primitive_fusing::run(program& p) { fuse_reorders(p); remove_redundant_reshape(p); + fuse_swiglu(p); fuse_bias(p); fuse_simple_primitives(p); fuse_constant_transposes(p); @@ -161,6 +163,46 @@ void prepare_primitive_fusing::fuse_reorders(program &p) { } } +void prepare_primitive_fusing::fuse_swiglu(program &p) { + GPU_DEBUG_GET_INSTANCE(debug_config); + bool disable_fc_swiglu_fusion = false; + GPU_DEBUG_IF(debug_config->disable_fc_swiglu_fusion == 1) + disable_fc_swiglu_fusion = true; + // Apply only for high performant GPU + if (disable_fc_swiglu_fusion || p.get_engine().get_device_info().execution_units_count < 128) + return; + // TODO: to support other glu types && other weight data types + auto itr = p.get_processing_order().begin(); + std::map>> fusing_history; + while (itr != p.get_processing_order().end()) { + auto node_itr = itr++; + auto& node = (*node_itr); + if (node->is_type()) { + if (!node->get_dependency(0).is_type()) + continue; + auto swiglu_prim = node->get_kernel_impl_params()->typed_desc(); + auto& fc_node = node->get_dependency(0); + if (node->get_dependencies().size() > 1) + continue; + if (!node->get_dependency(0).get_fused_primitives().empty()) + continue; + auto in_dt = fc_node.get_input_layout(0).data_type; + if (in_dt != data_types::f16) + continue; + auto wt_dt = fc_node.get_input_layout(1).data_type; + if (!data_type_traits::is_i4_u4(wt_dt)) + continue; + if (swiglu_prim->glu_type != ov::op::internal::GLU::GluType::Swish || + !(swiglu_prim->axis == -1 || swiglu_prim->axis == static_cast(node->get_output_layout(0).get_partial_shape().size()) - 1)) + continue; + GPU_DEBUG_TRACE_DETAIL << node->id() << " : fuse swiglu to " << fc_node.id() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - split axis : " << swiglu_prim->axis << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - split length : " << swiglu_prim->split_lengths << std::endl; + p.fuse_nodes(fc_node, *node, &fusing_history); + } + } +} + void prepare_primitive_fusing::fuse_bias(program &p) { auto itr = p.get_processing_order().begin(); while (itr != p.get_processing_order().end()) { @@ -188,6 +230,17 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (!is_bias_add) continue; + for (auto& dep : eltw_node.get_dependencies()) { + auto& fused_prims = dep.first->get_fused_primitives(); + if (std::any_of(fused_prims.begin(), fused_prims.end(), [](const fused_primitive_desc& f_desc) { + return f_desc.is_type(); + })) { + GPU_DEBUG_TRACE_DETAIL << "Skip fusing " << eltw_node.id() << " to " << dep.first->id() << " because " + << dep.first->id() << " has fused swiglu." 
<< std::endl; + continue; + } + } + auto is_3d_fully_connected = [](program_node& node) { if (!node.is_type()) return false; @@ -491,6 +544,13 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { }; auto fc_supports_fusings = [&](fully_connected_node& node) -> bool { + auto& fused_prims = node.get_fused_primitives(); + if (std::any_of(fused_prims.begin(), fused_prims.end(), [](const fused_primitive_desc& f_desc) { + return f_desc.is_type(); + })) { + GPU_DEBUG_TRACE_DETAIL << node.id() << " has fused swiglu. Skip fusing more primitives" << std::endl; + return false; + } if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.get_preferred_impl_type(node, format::any /*dummy*/) == impl_types::onednn) { return true; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp index 04f691c2bd2ca9..110444c2c6255c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp @@ -132,15 +132,16 @@ struct fully_connected_impl : typed_primitive_impl_ocl { return layouts; }; - auto get_fc_output_layout = [primitive](const std::vector& input_layouts, const layout& output_layout) { + auto get_fc_output_layout = [primitive](const std::vector& input_layouts, const layout& output_layout, bool swiglu_fused) { auto updated_out_layout = output_layout; auto input0_pshape = input_layouts[0].get_partial_shape(); auto input1_pshape = input_layouts[1].get_partial_shape(); ov::PartialShape updated_out_pshape {input0_pshape[0], input1_pshape[0]}; + const auto output_feature_size = swiglu_fused ? input1_pshape[0] / 2 : input1_pshape[0]; if (primitive->input_size == 3) { - updated_out_pshape = { input0_pshape[0], input0_pshape[1], input1_pshape[0] }; + updated_out_pshape = { input0_pshape[0], input0_pshape[1], output_feature_size}; } updated_out_layout.set_partial_shape(updated_out_pshape); @@ -149,6 +150,13 @@ struct fully_connected_impl : typed_primitive_impl_ocl { bool allow_new_shape_infer = impl_param.get_program().is_new_shape_infer(); auto updated_impl_param = impl_param; + bool swiglu_fused = false; + if (updated_impl_param.fused_desc.size() > 0) { + for (const auto& f : updated_impl_param.fused_desc) { + if (f.is_type()) + swiglu_fused = true; + } + } const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts, allow_new_shape_infer); for (size_t i = 0; i < input_layouts.size(); ++i) { @@ -156,7 +164,7 @@ struct fully_connected_impl : typed_primitive_impl_ocl { } updated_impl_param.weights_layout = input_layouts[1]; - updated_impl_param.output_layouts[0] = get_fc_output_layout(input_layouts, impl_param.get_output_layout()); + updated_impl_param.output_layouts[0] = get_fc_output_layout(input_layouts, impl_param.get_output_layout(), swiglu_fused); return updated_impl_param; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 0a999a5a124d3b..42d83a0265d290 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -32,11 +32,13 @@ #include "intel_gpu/primitives/embedding_bag.hpp" #include "intel_gpu/primitives/extract_image_patches.hpp" +#include "swiglu_inst.h" #include "activation_inst.h" #include "eltwise_inst.h" #include "quantize_inst.h" #include "reorder_inst.h" +#include 
"kernel_selector/kernels/swiglu/swiglu_kernel_base.h" #include "kernel_selector/kernels/activation/activation_kernel_base.h" #include "kernel_selector/kernels/depth_to_space/depth_to_space_kernel_base.h" #include "kernel_selector/kernels/eltwise/eltwise_kernel_base.h" @@ -1009,7 +1011,13 @@ kernel_selector::activation_function get_kernel_selector_activation_param(activa } std::shared_ptr convert_fuse_params(std::shared_ptr p) { - if (p->type() == activation::type_id()) { + if (p->type() == swiglu::type_id()) { + auto casted = std::dynamic_pointer_cast(p); + auto axis = casted->_desc->axis; + auto split_length = casted->_desc->split_lengths; + auto split_to_glu_idx = casted->_desc->split_to_glu_idx; + return std::make_shared(axis, split_length, split_to_glu_idx); + } else if (p->type() == activation::type_id()) { auto casted = std::dynamic_pointer_cast(p); auto desc = casted->_desc; kernel_selector::base_activation_params p; diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 61c34c0eff548f..490076a37f788e 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -140,6 +140,7 @@ class prepare_primitive_fusing : public base_pass { private: void run(program& p) override; void fuse_bias(program &p); + void fuse_swiglu(program &p); void fuse_reorders(program& p); void fuse_simple_primitives(program &p); void fuse_constant_transposes(program &p); diff --git a/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h b/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h index 6a5ce08dc54bd2..755e9ab33c2db6 100644 --- a/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h @@ -10,6 +10,11 @@ namespace cldnn { +class SwigluFuseParams : public NodeFuseParams { +public: + SwigluFuseParams(std::shared_ptr desc) : NodeFuseParams(swiglu::type_id()), _desc(std::move(desc)) {} + std::shared_ptr _desc; +}; template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; @@ -19,6 +24,10 @@ struct typed_program_node : public typed_program_node_base { program_node& input(size_t index = 0) const { return get_dependency(index); } std::vector get_shape_infer_dependencies() const override { return {}; } + + std::shared_ptr get_fuse_params() const override { + return std::make_shared(typed_desc()); + } }; using swiglu_node = typed_program_node; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 5680eedcb8f87c..0737362405ff9c 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -38,6 +38,7 @@ #include "gather_inst.h" #include "broadcast_inst.h" #include "dynamic_quantize_inst.h" +#include "swiglu_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" #include "impls/registry/implementation_manager.hpp" #include "impls/registry/registry.hpp" @@ -2606,6 +2607,16 @@ bool primitive_inst::is_valid_fusion() const { } else { if (fd.is_type() || fd.is_type()) continue; + if (fd.is_type()) { + OPENVINO_ASSERT(_node->is_type() && _node->get_preferred_impl_type() == impl_types::ocl); + if (!_node->get_selected_impl()) + return false; + // TODO : support ref kernel too + if (_node->get_selected_impl()->get_kernel_name().find("fully_connected_gpu_bf_tiled") != std::string::npos) + return true; + else + return 
false; + } OPENVINO_THROW("[GPU] Unsupported fused operation in dynamic shape: type=", fd.desc->type_string(), ", id=", fd.desc->id); } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 201fa3a155caa9..5161887b79e57a 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -10,6 +10,7 @@ #include "activation_inst.h" #include "reorder_inst.h" #include "quantize_inst.h" +#include "swiglu_inst.h" #include "intel_gpu/runtime/debug_configuration.hpp" #ifdef ENABLE_ONEDNN_FOR_GPU #include "convolution_inst.h" @@ -770,6 +771,15 @@ void program_node::save(cldnn::BinaryOutputBuffer& ob) const { ob << casted->_out_hi; ob << casted->_out_scale; ob << casted->_out_shift; + } else if (f_desc.f_param->type() == swiglu::type_id()) { + auto casted = std::dynamic_pointer_cast(f_desc.f_param); + if (get_program().has_node(casted->_desc->id)) { + ob << true; + ob << casted->_desc->id; + } else { + ob << false; + ob << casted->_desc; + } } ob << f_desc.deps.size(); @@ -975,6 +985,18 @@ void program_node::load(cldnn::BinaryInputBuffer& ib) { need_pre_shift, need_clamp, need_min_clamp, need_max_clamp, per_tensor_input_range, per_tensor_input_scale, per_tensor_input_shift, per_tensor_output_range, per_tensor_output_scale, per_tensor_output_shift, in_lo, in_hi, in_scale, in_shift, out_lo, out_hi, out_scale, out_shift); + } else if (f_param_type == swiglu::type_id()) { + ib >> exist_prim; + std::shared_ptr param_desc; + if (exist_prim) { + primitive_id desc_id; + ib >> desc_id; + param_desc = std::dynamic_pointer_cast(get_program().get_node_ptr(desc_id)->desc); + } else { + ib >> param_desc; + } + f_desc.f_param = std::make_shared(param_desc); + } else { f_desc.f_param = std::make_shared(f_param_type); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 201b59c160cf27..01c8e8853e350d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -95,6 +95,12 @@ KERNEL(quantize_input)( # error "fully_connected_gpu_bf_tiled.cl - TILE_K must be one of {1, 2, 4}" # endif #endif + +#ifdef SWIGLU_LENGTH +# if OUTER_OFM != 2 +# error "fully_connected_gpu_bf_tiled.cl - outer_ofm should be 2 when swiglu is fused" +# endif +#endif #if TILE_K == 4 && COMPRESSED_WEIGHTS_INT4 && FILTER_LAYOUT_OS_IS_YX_OSV32_ISV2 // Data stored in memory : f0k0k1|f16k0k1|f0k2k3|f16k2k3 // => unpack as f0k0k1|f0k2k3|f16k0k1|f16k2k3 so that the weight access order is preserved @@ -210,14 +216,27 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // full dispatch pipeline. 
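For orientation (not part of the patch), the dispatch index math that this hunk modifies can be read as the host-side C++ sketch below; the function name and the `f_per_wi` parameter are illustrative. The effect of the `SWIGLU_LENGTH` branch is that `f_per_wi` stays `TILE_OFM * SIMD` instead of `OUTER_OFM * TILE_OFM * SIMD`, because with swiglu fused the `OUTER_OFM` loop revisits the same output features for the gate and up halves rather than stepping to the next feature tile.

```cpp
#include <cstdint>

// Sketch assumptions: f_per_wi == TILE_OFM * SIMD when SWIGLU_LENGTH is defined,
// OUTER_OFM * TILE_OFM * SIMD otherwise; all names are illustrative.
constexpr uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

uint32_t out_feature_base(uint32_t gid,
                          uint32_t tile_out_f_num,
                          uint32_t dispatch_fsv,
                          uint32_t dispatch_bsv,
                          uint32_t f_per_wi) {
    const uint32_t feature_mini = gid % dispatch_fsv;
    const uint32_t feature_blocks = ceil_div(tile_out_f_num, f_per_wi) / dispatch_fsv;
    const uint32_t feature_mega = gid / (dispatch_fsv * dispatch_bsv) % feature_blocks;
    // First output feature written by this work item (non-SLM path).
    return (feature_mega * dispatch_fsv + feature_mini) * f_per_wi;
}
```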
uint feature_mini_block = gid % DISPATCH_FSV; uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + #ifdef SWIGLU_LENGTH + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + #else uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + #endif #if USE_SLM + #ifdef SWIGLU_LENGTH + uint out_f = gid * (TILE_OFM * SIMD); + #else uint out_f = gid * (OUTER_OFM * TILE_OFM * SIMD); + #endif uint out_b = LWS_BATCHES * TILE_B * (uint)get_group_id(2) + local_id * TILE_B; #else + #ifdef SWIGLU_LENGTH + uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD); + #else uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (OUTER_OFM * TILE_OFM * SIMD); + #endif uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * TILE_B); #endif @@ -299,9 +318,20 @@ inline void FUNC(fc_bf_tiled_kernel_default)( ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif + ACTIVATION_VEC_TYPE activated[TILE_B] = { }; #if OUTER_OFM > 1 uint input_offset_init = input_offset; - unroll_for (uint oi = 0; oi < OUTER_OFM; ++oi) { + uint weights_offset_init = weights_offset; + uint out_f_init = out_f; + __attribute__((opencl_unroll_hint(1))) + for (uint oi = 0; oi < OUTER_OFM; ++oi) { + input_offset = input_offset_init; + #ifdef SWIGLU_LENGTH + weights_offset = weights_offset_init + oi * (FILTER_IFM_NUM / (TILE_K_OFM / TILE_K_OFM_PACKED) ) * SWIGLU_LENGTH; + out_f += SWIGLU_LENGTH * oi; + #else + out_f += TILE_OFM * SIMD * oi; + #endif #endif #if REALIGN_FP16_OFFSET @@ -669,14 +699,38 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif // MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 // ===================================================================================================================================== // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[TILE_B] = { }; - for (uint bi = 0; bi < TILE_B; ++bi) { + unroll_for (uint bi = 0; bi < TILE_B; ++bi) { + #ifdef SWIGLU_LENGTH + #if SWIGLU_SPLIT_TO_GLU_IDX == 0 + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + activated[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * activated[bi]))); + } else { + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #else + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } else { + acc[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * acc[bi]))); + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #endif + #else activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + #endif #if OUTER_OFM > 1 acc[bi] = 0; #endif } +#if OUTER_OFM > 1 && defined(SWIGLU_LENGTH) + } + out_f = out_f_init; +#endif + #if BIAS_TERM #if TILE_OUT_F_NUM % (OUTER_OFM * TILE_OFM * SIMD) == 0 BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); @@ -746,9 +800,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; } } -#if OUTER_OFM > 1 - out_f += TILE_OFM * SIMD; - input_offset = input_offset_init; +#if OUTER_OFM > 1 && !defined(SWIGLU_LENGTH) } #endif // 
===================================================================================================================================== @@ -816,8 +868,14 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // full dispatch pipeline. uint feature_mini_block = gid % DISPATCH_FSV; uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + #ifdef SWIGLU_LENGTH uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + #else + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + #endif + FILTER_VEC_TYPE wei = 0; @@ -895,6 +953,22 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif + ACTIVATION_VEC_TYPE activated[TILE_B] = { }; +#if OUTER_OFM > 1 + uint input_offset_init = input_offset; + uint weights_offset_init = weights_offset; + uint out_f_init = out_f; + __attribute__((opencl_unroll_hint(1))) + for (uint oi = 0; oi < OUTER_OFM; ++oi) { + input_offset = input_offset_init; + #ifdef SWIGLU_LENGTH + weights_offset = weights_offset_init + oi * (FILTER_IFM_NUM / (TILE_K_OFM / TILE_K_OFM_PACKED) ) * SWIGLU_LENGTH; + out_f += SWIGLU_LENGTH * oi; + #else + out_f += TILE_OFM * SIMD * oi; + #endif +#endif + // ===================================================================================================================================== // Main computation loop const uint iterations = MAIN_LOOP_ELEMENTS_COUNT / TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD) @@ -1164,11 +1238,37 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // ===================================================================================================================================== // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[TILE_B] = { }; for (uint bi = 0; bi < TILE_B; ++bi) { + #ifdef SWIGLU_LENGTH + #if SWIGLU_SPLIT_TO_GLU_IDX == 0 + if (oi == 0) { + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + activated[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * activated[bi]))); + } else { + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #else + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } else { + acc[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * acc[bi]))); + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #endif + #else activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + #endif +#if OUTER_OFM > 1 + acc[bi] = 0; +#endif } +#if OUTER_OFM > 1 && defined(SWIGLU_LENGTH) + } + out_f = out_f_init; +#endif + #if BIAS_TERM #if TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); @@ -1240,6 +1340,9 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; } } +#if OUTER_OFM > 1 && !defined(SWIGLU_LENGTH) + } +#endif // ===================================================================================================================================== } #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl 
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl index ddffa87b202816..ca5c1ea3646d02 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ -25,7 +25,6 @@ inline void (FUNC_NAME)( ) { uint gid = (uint)get_group_id(0); uint sglid = (uint)get_sub_group_local_id(); - // Dispatch as bs_fs_bsv_fsv, where bsv = DISPATCH_BSV and fsv = DISPATCH_FSV. // This allows more fine grained control over dispatch order than using work-groups and // avoids requirement of threads being available for whole work-group. @@ -33,10 +32,19 @@ inline void (FUNC_NAME)( // full dispatch pipeline. uint feature_mini_block = gid % DISPATCH_FSV; uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + #ifdef SWIGLU_LENGTH + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + #else uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + #endif + #ifdef SWIGLU_LENGTH + uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD); + #else uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (OUTER_OFM * TILE_OFM * SIMD); + #endif uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * FORCED_TILE_B); ACCUMULATOR_VEC_TYPE acc[FORCED_TILE_B] = { }; @@ -90,9 +98,19 @@ inline void (FUNC_NAME)( ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif + ACTIVATION_VEC_TYPE activated[FORCED_TILE_B] = { }; #if OUTER_OFM > 1 uint input_offset_init = input_offset; + uint weights_offset_init = weights_offset; + uint out_f_init = out_f; unroll_for (uint oi = 0; oi < OUTER_OFM; ++oi) { + input_offset = input_offset_init; + #ifdef SWIGLU_LENGTH + weights_offset = weights_offset_init + oi * (FILTER_IFM_NUM / (TILE_K_OFM / TILE_K_OFM_PACKED) ) * SWIGLU_LENGTH; + out_f += SWIGLU_LENGTH * oi; + #else + out_f += TILE_OFM * SIMD * oi; + #endif #endif #if REALIGN_FP16_OFFSET @@ -297,14 +315,37 @@ inline void (FUNC_NAME)( #endif // MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 // ===================================================================================================================================== // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[FORCED_TILE_B] = { }; for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + #ifdef SWIGLU_LENGTH + #if SWIGLU_SPLIT_TO_GLU_IDX == 0 + if (oi == 0) { + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + activated[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * activated[bi]))); + } else { + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #else + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } else { + acc[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * acc[bi]))); + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #endif + #else activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + #endif #if OUTER_OFM > 1 acc[bi] = 0; #endif } +#if OUTER_OFM > 1 && defined(SWIGLU_LENGTH) + } + out_f = out_f_init; 
+#endif + #if BIAS_TERM #if TILE_OUT_F_NUM % (OUTER_OFM * TILE_OFM * SIMD) == 0 BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); @@ -396,9 +437,7 @@ inline void (FUNC_NAME)( output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; } } -#if OUTER_OFM > 1 - out_f += TILE_OFM * SIMD; - input_offset = input_offset_init; +#if OUTER_OFM > 1 && !defined(SWIGLU_LENGTH) } #endif // ===================================================================================================================================== diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 02304512637783..46e8f7f1104f0d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -4,6 +4,7 @@ #include "fully_connected_kernel_bf_tiled.h" #include "kernel_selector_utils.h" +#include "swiglu/swiglu_kernel_base.h" #include #include #include "common_types.h" @@ -163,7 +164,21 @@ static bool is_weight_small_kn(const fully_connected_params& params, size_t outp return output_f / 2 /*most frequently used tile_ofm*/ <= min_num_threads; } +static bool is_swiglu_fused(const fully_connected_params& params) { + bool swiglu_fused = false; + if (!params.fused_ops.empty()) { + for (auto p : params.fused_ops) { + if (p.GetType() == kernel_selector::KernelType::SWIGLU) + swiglu_fused = true; + } + } + if (swiglu_fused) + OPENVINO_ASSERT(params.fused_ops.size() == 1); + return swiglu_fused; +} static bool is_suitable_outer_ofm(const fully_connected_params& params, size_t output_f) { + if (is_swiglu_fused(params)) + return true; size_t min_num_threads = params.engineInfo.computeUnitsCount * simd; return (params.weights.OFM().v > params.weights.IFM().v * 6 && output_f / 8 /* tile_ofm=4 and outer_ofm=2 */ > min_num_threads * 1.5); @@ -406,6 +421,8 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, while (max_tile_ofm * 2 * simd <= output_f && max_tile_ofm < 4) max_tile_ofm *= 2; + bool swiglu_fused = is_swiglu_fused(params); + if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4 || (is_weight_dyn_quantizable(params) && should_dynamic_quantize(params))) { // Only 4bit weight type is fully optimized to use SLM. In default kernel, SLM is not applied to 8bit weight. 
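For orientation (not part of the patch), the `SWIGLU_LENGTH` post-processing branches above implement plain SwiGLU: the two `OUTER_OFM` passes produce the two halves of the split FC output, Swish (`x * sigmoid(x)`) is applied to the half selected by `SWIGLU_SPLIT_TO_GLU_IDX`, and the result is multiplied elementwise with the other half. A minimal C++ reference sketch follows; `swish`, `swiglu_ref` and the half0/half1 naming are illustrative.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Swish activation: x * sigmoid(x), matching the acc / (1 + exp(-acc)) form in the kernel.
inline float swish(float x) { return x / (1.0f + std::exp(-x)); }

// half0/half1 correspond to the oi == 0 and oi == 1 passes of the fused kernel.
std::vector<float> swiglu_ref(const std::vector<float>& half0,
                              const std::vector<float>& half1,
                              int split_to_glu_idx) {
    std::vector<float> out(half0.size());
    for (std::size_t i = 0; i < half0.size(); ++i)
        out[i] = split_to_glu_idx == 0 ? swish(half0[i]) * half1[i]
                                       : half0[i] * swish(half1[i]);
    return out;
}
```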
@@ -426,30 +443,39 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { return selector.Default(tune_params(1, 1, 4, 4, 1, 1, 1, EXE_MODE_DEFAULT)); } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { - selector.Case(tune_params(1, 4, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)) - .Case(tune_params(1, 4, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + // Here : b1 static + if (swiglu_fused) { + return selector.Default(tune_params(1, 4, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)); + } else { + selector.Case(tune_params(1, 4, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)) + .Case(tune_params(1, 4, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } } else { - return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + if (swiglu_fused) { + return selector.Default(tune_params(1, 2, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)); + } else { + return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } } } } else { // Try to use SLM kernels if possible + unsigned int forced_outer_ofm = swiglu_fused ? 2 : 1; if (preferred_kernel_type != KernelType::DEFAULT) { if (params.is_shape_agnostic && !should_dynamic_quantize(params)) { - selector.Case(tune_params(16, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) - .Case(tune_params(16, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); + selector.Case(tune_params(16, 2, 2, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) + .Case(tune_params(16, 2, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } - - selector.Case(tune_params(8, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) - .Case(tune_params(8, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); + selector.Case(tune_params(8, 2, 2, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) + .Case(tune_params(8, 2, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) - return selector.Default(tune_params(8, 1, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); + return selector.Default(tune_params(8, 1, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT)); else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) - return selector.Default(tune_params(8, 4, 1, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + return selector.Default(tune_params(8, 4, 1, 2, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT)); else - return selector.Default(tune_params(8, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); + return selector.Default(tune_params(8, 2, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT)); } } else if (params.compressed && params.engineInfo.supports_immad) { return selector.Default(tune_params(1, 1, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); @@ -526,8 +552,12 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au kernel_type = kernel_number == 0 ? 
KernelType::DEFAULT : KernelType::SLM; auto tparams = GetAutoTuneParams(params, kernel_type, autoTuneIndex); + std::pair threads; + if (is_swiglu_fused(params)) + threads = get_output_aligned_bf_size(params, true, tparams.tile_b, tparams.tile_ofm * simd); + else + threads = get_output_aligned_bf_size(params, true, tparams.tile_b, tparams.tile_ofm * tparams.outer_ofm * simd); - auto threads = get_output_aligned_bf_size(params, true, tparams.tile_b, tparams.tile_ofm * tparams.outer_ofm * simd); auto batch_threads = threads.first; auto feature_threads = threads.second; @@ -575,6 +605,13 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para size_t tile_k_ofm_packed = tile_k_ofm; size_t quantize_grp_size = get_dynamic_quantize_group_size(params); + if (is_swiglu_fused(params)) { + auto split_length = params.fused_ops[0].GetOpParams()->split_length; + auto split_to_glu_idx = params.fused_ops[0].GetOpParams()->split_to_glu_idx; + jit.AddConstant(MakeJitConstant("SWIGLU_LENGTH", split_length)); + jit.AddConstant(MakeJitConstant("SWIGLU_SPLIT_TO_GLU_IDX", split_to_glu_idx)); + } + bool add_decompress_scale_post_op = false; WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { @@ -723,7 +760,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)")); } - if (!params.fused_ops.empty()) { + if (!params.fused_ops.empty() && !is_swiglu_fused(params)) { std::vector idx_order_scalar = { "(out_b + bi)", "(out_f + sglid)", "0", "0" }; std::vector idx_order_vec = { "(out_b + bi)", "(out_f + sglid + fi * SIMD)", "0", "0" }; if (params.outputs[0].GetLayout() == DataLayout::bfyx) { @@ -828,7 +865,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa auto output_f = get_output_aligned_bf_size(fc_params, false).second; WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; - if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 + if (!is_swiglu_fused(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h index cbbf52adf344ce..1093c7377bf76f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h @@ -76,7 +76,8 @@ class FullyConnected_bf_tiled : public FullyConnectedKernelBase { std::vector GetSupportedFusedOps() const override { return { FusedOpType::ACTIVATION, FusedOpType::ELTWISE, - FusedOpType::QUANTIZE }; + FusedOpType::QUANTIZE, + FusedOpType::SWIGLU }; } JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override; bool Validate(const Params& params) const override; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h index 2f5c046690f78d..bb5625ba087a2d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h @@ -21,6 +21,17 @@ struct swiglu_params : public base_params { int32_t split_to_glu_idx; }; +struct swiglu_fuse_params : fuse_params { + explicit swiglu_fuse_params(int32_t axis, size_t split_lengths, size_t split_to_glu_idx) + : fuse_params(KernelType::SWIGLU), + axis(axis), + split_length(split_lengths), + split_to_glu_idx(split_to_glu_idx) {} + int32_t axis; + size_t split_length; + size_t split_to_glu_idx; +}; + class SwiGLUKernelBase : public KernelBaseOpenCL { public: using KernelBaseOpenCL::KernelBaseOpenCL; diff --git a/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp b/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp index 5df2cafd41a41f..23b44dcc1a4677 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp @@ -21,7 +21,7 @@ static void CreateGLUOp(ProgramBuilder& p, const std::shared_ptr& op) { if (p.use_new_shape_infer()) { auto prim = cldnn::swiglu(primitive_name, inputs[0], - op->get_axis(), + (op->get_axis() < 0 ? op->get_input_partial_shape(0).size() + op->get_axis() : op->get_axis()), op->get_split_lengths(), op->get_glu_type(), op->get_split_to_glu_idx(), @@ -31,7 +31,7 @@ static void CreateGLUOp(ProgramBuilder& p, const std::shared_ptr& op) { } else { auto prim = cldnn::swiglu(primitive_name, inputs[0], - op->get_axis(), + (op->get_axis() < 0 ? op->get_input_partial_shape(0).size() + op->get_axis() : op->get_axis()), op->get_split_lengths(), op->get_glu_type(), op->get_split_to_glu_idx(), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp index fcb339531c1883..327de1424c34c9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp @@ -18,16 +18,25 @@ namespace ov { namespace intel_gpu { -FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() { +FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swiglu) { using namespace ov::pass::pattern; - auto is_target_pattern = [](const Output& output) { + GPU_DEBUG_GET_INSTANCE(debug_config); + // Three FCs connected to the same input + size_t min_num_fcs_to_fuse = 3; + // Note: + // For cldnn, two fcs in mlp will be fused at horizontal fc fusion, and then swiglu will be fused at prepare_primitive_fusion + // i.e., eltwise((fc + swish), fc) => fused_fc + swiglu => fused_fc_swilgu + // Onednn gemms are to be handled in a different way (TBD) + if (fuse_mlp_swiglu) + min_num_fcs_to_fuse = 2; + auto is_target_pattern = [min_num_fcs_to_fuse](const Output& output) { + const int max_num_fcs_to_fuse = 3; // Currently this pass targets only compressed FCs (QKV) on dynamic generative models // inputs: input, weight, bias, scale, [zp] // Bias/scale/zp are constant or none // if it is not constant, the only allowed cases are Constant => convert // All FCs have same # of valid inputs (e.g., if one of the fc has zp, all fcs have zp) - auto is_constant = [](const std::shared_ptr node) { if (std::dynamic_pointer_cast(node)) return true; @@ -40,9 +49,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() { auto is_placeholder = [](const 
std::shared_ptr node) { return std::dynamic_pointer_cast(node); }; - // Three FCs connected to the same input - const int min_num_fcs_to_fuse = 3; - const int max_num_fcs_to_fuse = 3; + const auto& fc = std::dynamic_pointer_cast(output.get_node_shared_ptr()); const auto& input = fc->get_input_node_shared_ptr(0); if (!fc->get_input_partial_shape(0).is_dynamic()) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp index b6a852354bad8d..67abaa3df54357 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp @@ -12,7 +12,7 @@ namespace intel_gpu { class FullyConnectedHorizontalFusion: public ov::pass::MatcherPass { public: OPENVINO_RTTI("FullyConnectedHorizontalFusion", "0"); - FullyConnectedHorizontalFusion(); + FullyConnectedHorizontalFusion(bool fuse_mlp_swiglu = false); }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index fcb88560944854..e47ccbb09a9c43 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -913,12 +913,18 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); bool disable_horizontal_fc_fusion = false; + bool disable_fc_swiglu_fusion = false; GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->disable_horizontal_fc_fusion == 1) disable_horizontal_fc_fusion = true; - + GPU_DEBUG_IF(debug_config->disable_fc_swiglu_fusion == 1) + disable_fc_swiglu_fusion = true; + // mlp fusion is only supported for cldnn on high performant GPUis + bool fuse_mlp_swiglu = !device_info.supports_immad && + device_info.execution_units_count >= 128 && + !disable_fc_swiglu_fusion; if (!disable_horizontal_fc_fusion) - manager.register_pass(); + manager.register_pass(fuse_mlp_swiglu); // ZP should not be folded for FC. But still, ZP should be folded for Gather. // Therefore, run MarkDequantizationSubgraph again to fold ZP constant. diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 4a68355e1bc8ba..65ca31f16c720c 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -191,6 +191,7 @@ static void print_help_messages() { message_list.emplace_back("OV_GPU_DynamicQuantizeGroupSize", "Specify a group size of dynamic quantization to enable " "dynamic quantization for Fully-connected primitive."); message_list.emplace_back("OV_GPU_DisableHorizontalFCFusion", "Disable horizontal fc fusion"); + message_list.emplace_back("OV_GPU_DisableFCSwigluFusion", "Disable fc + swiglu fusion"); message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space."); message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. 
Expects 4 values separated by space in " "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), " @@ -259,7 +260,8 @@ debug_configuration::debug_configuration() , use_usm_host(0) , use_kv_cache_compression(-1) , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) - , disable_horizontal_fc_fusion(0) { + , disable_horizontal_fc_fusion(0) + , disable_fc_swiglu_fusion(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -314,6 +316,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression); get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size); get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion); + get_gpu_debug_env_var("DisableFCSwigluFusion", disable_fc_swiglu_fusion); std::string dump_iteration_str; get_gpu_debug_env_var("DumpIteration", dump_iteration_str); std::string mem_preallocation_params_str; diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp index 5e9b5134fb3802..09e164742f3fd9 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -73,7 +74,7 @@ class FullyConnectedFusingTest : public ::BaseFusingTest{ + fully_connected_test_params{ CASE_FC_FP16_INT4_SWIGLU_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_INT4_SWIGLU_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_INT4_SWIGLU_3, 2, 3 }, +})); + class fc_imad_int8_eltwise_add_ocl_dynamic : public FullyConnectedFusingTest { public: void run_test() { diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp index eb0f63c651e50d..c469925083b775 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp @@ -147,6 +147,12 @@ class BaseFusingTest : public ::testing::TestWithParam { } else if (l.data_type == data_types::i8) { VF rnd_vec(s.count(), static_cast(fill_value)); set_values(prim, rnd_vec); + } else if (l.data_type == data_types::u4) { + VF rnd_vec(s.count()/2, static_cast(fill_value)); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i4) { + VF rnd_vec(s.count()/2, static_cast(fill_value)); + set_values(prim, rnd_vec); } else { throw std::runtime_error("get_mem: Unsupported precision"); } @@ -186,6 +192,12 @@ class BaseFusingTest : public ::testing::TestWithParam { } else if (l.data_type == data_types::u8) { VF rnd_vec = rg.generate_random_1d(s.count(), min, max); set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i4) { + VF rnd_vec = rg.generate_random_1d(s.count()/2, min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::u4) { + VF rnd_vec = rg.generate_random_1d(s.count()/2, min, max); + set_values(prim, rnd_vec); } return prim; From 94f647dc5f0e3a349aadabf4ae377aa6a2b063b4 Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Fri, 6 Dec 2024 16:50:59 +0100 Subject: [PATCH 09/23] [core] Extend Core API to accept std::filesystem::path when build with cpp17 (#27950) ### Details: - The `ov::Core` accepts `std::filesytem::path` in functions where string as path is 
used. ### Tickets: - CVS-157908 --------- Signed-off-by: Pawel Raasz --- .../shape_inference/include/ov_optional.hpp | 4 ++ src/core/tests/pattern.cpp | 6 +- .../include/openvino/runtime/core.hpp | 50 ++++++++++++++ .../tests/functional/ov_core_test.cpp | 69 +++++++++++++++++-- .../tests/functional/ov_extension_test.cpp | 6 ++ 5 files changed, 126 insertions(+), 9 deletions(-) diff --git a/src/core/shape_inference/include/ov_optional.hpp b/src/core/shape_inference/include/ov_optional.hpp index f7f8b474f9a5a6..15973ae0c8a5f8 100644 --- a/src/core/shape_inference/include/ov_optional.hpp +++ b/src/core/shape_inference/include/ov_optional.hpp @@ -7,6 +7,9 @@ #include namespace ov { +#ifdef OPENVINO_CPP_17_VER +using optional = std::optional; +#else /** * @brief Store optional object of type T (basic version of std::optional). @@ -132,4 +135,5 @@ class optional { bool m_has_value = false; Storage m_opt{}; }; +#endif } // namespace ov diff --git a/src/core/tests/pattern.cpp b/src/core/tests/pattern.cpp index 050c36b65baad1..982e59b55f0f97 100644 --- a/src/core/tests/pattern.cpp +++ b/src/core/tests/pattern.cpp @@ -558,8 +558,8 @@ TEST(pattern, multiple_optionals_in_row) { // Pattern: auto in = wrap_type(); - auto pattern_convert = optional(in); - auto pattern_relu = optional(pattern_convert); + auto pattern_convert = pattern::optional(in); + auto pattern_relu = pattern::optional(pattern_convert); auto pattern_sigmoid = wrap_type({pattern_relu}); // Test: @@ -1255,4 +1255,4 @@ TEST(pattern, pattern_optional_root) { // Should perfectly match ASSERT_TRUE(tm.match(pattern_relu, model_relu)); -} \ No newline at end of file +} diff --git a/src/inference/include/openvino/runtime/core.hpp b/src/inference/include/openvino/runtime/core.hpp index f0ba27c1cf5daa..c13432d664e736 100644 --- a/src/inference/include/openvino/runtime/core.hpp +++ b/src/inference/include/openvino/runtime/core.hpp @@ -25,6 +25,10 @@ #include "openvino/runtime/remote_context.hpp" #include "openvino/runtime/tensor.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { /** @@ -95,9 +99,18 @@ class OPENVINO_RUNTIME_API Core { * * TF (*.pb) * * TFLite (*.tflite) * @return A model. + * @{ */ std::shared_ptr read_model(const std::string& model_path, const std::string& bin_path = {}) const; +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + std::shared_ptr read_model(const Path& model_path, const Path& bin_path = {}) const { + return read_model(model_path.string(), bin_path.string()); + } +#endif + /// @} + /** * @brief Reads models from IR / ONNX / PDPD / TF / TFLite formats. * @param model String with a model in IR / ONNX / PDPD / TF / TFLite format. @@ -197,6 +210,13 @@ class OPENVINO_RUNTIME_API Core { */ CompiledModel compile_model(const std::string& model_path, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const AnyMap& properties = {}) const { + return compile_model(model_path.string(), properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const AnyMap& properties = {}); #endif @@ -223,6 +243,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, Properties&&... 
properties) { + return compile_model(model_path.string(), std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -250,6 +277,13 @@ class OPENVINO_RUNTIME_API Core { const std::string& device_name, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, const AnyMap& properties = {}) { + return compile_model(model_path.string(), device_name, properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const std::string& device_name, @@ -279,6 +313,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, device_name, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, Properties&&... properties) { + return compile_model(model_path.string(), device_name, std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -359,9 +400,18 @@ class OPENVINO_RUNTIME_API Core { /** * @brief Registers an extension to a Core object. * @param library_path Path to the library with ov::Extension. + * @{ */ void add_extension(const std::string& library_path); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + void add_extension(const Path& model_path) { + add_extension(model_path.string()); + } +#endif + /// @} + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT /** * @brief Registers an extension to a Core object. diff --git a/src/inference/tests/functional/ov_core_test.cpp b/src/inference/tests/functional/ov_core_test.cpp index 26eb38e3fd13e5..60f91b85b3338a 100644 --- a/src/inference/tests/functional/ov_core_test.cpp +++ b/src/inference/tests/functional/ov_core_test.cpp @@ -8,9 +8,26 @@ #include "common_test_utils/common_utils.hpp" #include "common_test_utils/file_utils.hpp" +#include "functional_test_utils/test_model/test_model.hpp" #include "openvino/runtime/core.hpp" #include "openvino/util/file_util.hpp" +class CoreBaseTest : public testing::Test { +protected: + void generate_test_model_files(const std::string& name) { + auto prefix = ov::test::utils::generateTestFilePrefix(); + model_file_name = prefix + name + ".xml"; + weight_file_name = prefix + name + ".bin"; + ov::test::utils::generate_test_model(model_file_name, weight_file_name); + } + + void TearDown() override { + ov::test::utils::removeIRFiles(model_file_name, weight_file_name); + } + + std::string model_file_name, weight_file_name; +}; + #ifndef OPENVINO_STATIC_LIBRARY static void create_plugin_xml(const std::string& file_name, const std::string& plugin_name = "1") { @@ -33,7 +50,7 @@ static void remove_plugin_xml(const std::string& file_name) { ov::test::utils::removeFile(file_name); } -TEST(CoreBaseTest, LoadPluginXML) { +TEST_F(CoreBaseTest, LoadPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -42,7 +59,7 @@ TEST(CoreBaseTest, LoadPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { +TEST_F(CoreBaseTest, LoadPluginDifferentXMLExtension) { std::string xml_file_name = "test_plugin.test"; std::string xml_file_path = 
ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -51,7 +68,7 @@ TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -60,7 +77,7 @@ TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -69,7 +86,7 @@ TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadRelativeCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -78,7 +95,7 @@ TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string cwd_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -96,3 +113,43 @@ TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { } #endif + +#if defined(OPENVINO_CPP_VER_17) && defined(ENABLE_OV_IR_FRONTEND) +namespace ov::test { +TEST_F(CoreBaseTest, read_model_with_std_fs_path) { + generate_test_model_files("test-model"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.read_model(model_path); + EXPECT_NE(model, nullptr); + } + { + const auto model = core.read_model(model_path, weight_path); + EXPECT_NE(model, nullptr); + } +} + +TEST_F(CoreBaseTest, compile_model_with_std_fs_path) { + generate_test_model_files("model2"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.compile_model(model_path); + EXPECT_TRUE(model); + } + { + const auto devices = core.get_available_devices(); + + const auto model = core.compile_model(model_path, devices.at(0), ov::AnyMap{}); + EXPECT_TRUE(model); + } +} +} // namespace ov::test +#endif diff --git a/src/inference/tests/functional/ov_extension_test.cpp b/src/inference/tests/functional/ov_extension_test.cpp index 6f93a8acdaf2fa..b840c430d092e9 100644 --- a/src/inference/tests/functional/ov_extension_test.cpp +++ b/src/inference/tests/functional/ov_extension_test.cpp @@ -82,6 +82,12 @@ class CustomReLU : public ov::op::Op { }; #if defined(ENABLE_OV_IR_FRONTEND) +# ifdef OPENVINO_CPP_VER_17 +TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsPathLib) { + core.add_extension(std::filesystem::path(getOVExtensionPath())); + test(); +} +# endif TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsLib) { core.add_extension(getOVExtensionPath()); From e8fa9f7b84d1d19e4581f56ef4dd8e88934b878e Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Fri, 6 Dec 
2024 20:29:49 +0400 Subject: [PATCH 10/23] [TF FE] Run HSVToRGB tests on all platforms (#27945) **Details:** Run HSVToRGB tests on all platforms **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .../tensorflow_tests/test_tf_HSVToRGB.py | 53 +++++++------------ 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py b/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py index 9f3ab9845fb24f..17df8c52430ec5 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py @@ -1,32 +1,28 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(23345) + + class TestHSVToRGB(CommonTFLayerTest): def _prepare_input(self, inputs_info): assert 'images:0' in inputs_info - if self.special_case == "Black Image": - images_shape = inputs_info['images:0'] - inputs_data = {} - inputs_data['images:0'] = np.zeros(images_shape).astype(self.input_type) - elif self.special_case == "Grayscale Image": - images_shape = inputs_info['images:0'] - inputs_data = {} + images_shape = inputs_info['images:0'] + inputs_data = {} + if self.special_case == 'Black Image': + inputs_data['images:0'] = np.zeros(images_shape).astype(self.input_type) + elif self.special_case == 'Grayscale Image': inputs_data['images:0'] = np.broadcast_to([0, 0, 0.5], images_shape).astype(self.input_type) else: - images_shape = inputs_info['images:0'] - inputs_data = {} - inputs_data['images:0'] = np.random.rand(*images_shape).astype(self.input_type) - + inputs_data['images:0'] = rng.uniform(0.0, 1.0, images_shape).astype(self.input_type) return inputs_data - def create_hsv_to_rgb_net(self, input_shape, input_type, special_case=False): + def create_hsv_to_rgb_net(self, input_shape, input_type, special_case): self.special_case = special_case self.input_type = input_type tf.compat.v1.reset_default_graph() @@ -39,27 +35,16 @@ def create_hsv_to_rgb_net(self, input_shape, input_type, special_case=False): return tf_net, None - # Each input is a tensor of with values in [0,1]. - # The last dimension must be size 3. 
- test_data_basic = [ - dict(input_shape=[7, 7, 3], input_type=np.float32, special_case="Black Image"), - dict(input_shape=[7, 7, 3], input_type=np.float32, special_case="Grayscale Image"), - dict(input_shape=[5, 5, 3], input_type=np.float32), - dict(input_shape=[5, 23, 27, 3], input_type=np.float64), - dict(input_shape=[3, 4, 13, 15, 3], input_type=np.float64), - ] - - @pytest.mark.parametrize("params", test_data_basic) + @pytest.mark.parametrize('input_shape', [[3], [5, 3], [4, 5, 3], [5, 21, 21, 3]]) + @pytest.mark.parametrize('input_type', [np.float16, np.float32, np.float64]) + @pytest.mark.parametrize('special_case', [None, 'Black Image', 'Grayscale Image']) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') - def test_hsv_to_rgb_basic(self, params, ie_device, precision, ir_version, temp_dir, - use_legacy_frontend): + def test_hsv_to_rgb_basic(self, input_shape, input_type, special_case, + ie_device, precision, ir_version, temp_dir, + use_legacy_frontend): if ie_device == 'GPU': - pytest.skip("Accuracy mismatch on GPU") - self._test(*self.create_hsv_to_rgb_net(**params), + pytest.skip('158898: accuracy issue on GPU') + self._test(*self.create_hsv_to_rgb_net(input_shape, input_type, special_case), ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend) + use_legacy_frontend=use_legacy_frontend, custom_eps=3 * 1e-3) From b840082ac11b1608f349d9554b020498c328164f Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Mon, 9 Dec 2024 14:09:30 +0900 Subject: [PATCH 11/23] [GPU] Integrate dynamic quantization for onednn (#26940) ### Details: - Integrated grouped dynamic quantization from onednn - Integrated asymmetric per-token dynamic quantization from onednn - Those are not enabled by default, yet ### Tickets: - 148732, 157869, 157589 --- .../op/fully_connected_compressed.hpp | 1 + .../intel_gpu/primitives/dynamic_quantize.hpp | 13 +- .../intel_gpu/primitives/fully_connected.hpp | 18 +++ .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../prepare_primitive_fusing.cpp | 2 + .../src/graph/impls/ocl/dynamic_quantize.cpp | 8 +- .../impls/onednn/fully_connected_onednn.cpp | 47 +++++-- .../impls/onednn/fully_connected_onednn.hpp | 2 +- .../cl_kernels/dynamic_quantize_gpu_opt.cl | 133 ++++++++++++++++-- .../cl_kernels/dynamic_quantize_gpu_ref.cl | 50 ++++--- .../dynamic_quantize_kernel_opt.cpp | 56 +++++--- .../dynamic_quantize_kernel_ref.cpp | 18 ++- .../fully_connected_kernel_bf_tiled.cpp | 20 +-- .../src/plugin/ops/dynamic_quantize.cpp | 3 +- .../src/plugin/ops/fully_connected.cpp | 4 +- .../intel_gpu/src/plugin/program_builder.cpp | 4 + .../dynamic_quantize_fully_connected.cpp | 30 ++-- .../op/fully_connected_compressed.cpp | 5 +- .../src/plugin/transformations_pipeline.cpp | 22 ++- .../src/runtime/debug_configuration.cpp | 3 + .../src/runtime/execution_config.cpp | 7 +- .../dynamic/matmul_weights_decompression.cpp | 33 +++-- .../test_cases/dynamic_quantize_gpu_test.cpp | 61 +++++--- .../test_cases/fully_connected_gpu_test.cpp | 24 ++-- .../unit/test_cases/hash_key_gpu_test.cpp | 8 +- 25 files changed, 420 insertions(+), 153 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp index 1112a3785317a3..e58c6ab4cb17f1 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp @@ -22,6 +22,7 @@ class FullyConnectedCompressed : public FullyConnected { const ov::Output &w_decompression_scale, const ov::Output &w_decompression_zero_point, const ov::Output &a_decompression_scale, + const ov::Output &a_decompression_zero_point, const ov::element::Type output_type = ov::element::undefined); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index 79af223e32cdaa..8dd1ebf2809782 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -26,9 +26,11 @@ struct dynamic_quantize : public primitive_base { /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const Attributes& attrs) + const Attributes& attrs, + const size_t input_size = 3) : primitive_base(id, {input}) - , attrs(attrs) { + , attrs(attrs) + , input_size(input_size) { num_outputs = 2; if (attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) @@ -36,6 +38,7 @@ struct dynamic_quantize : public primitive_base { } Attributes attrs; + size_t input_size; size_t hash() const override { size_t seed = primitive::hash(); @@ -46,6 +49,7 @@ struct dynamic_quantize : public primitive_base { seed = hash_combine(seed, attrs.scale_dt.hash()); seed = hash_combine(seed, attrs.zp_dt.hash()); seed = hash_combine(seed, attrs.output_storage_type); + seed = hash_combine(seed, input_size); return seed; } @@ -62,7 +66,8 @@ struct dynamic_quantize : public primitive_base { attrs.quantization_dt == rhs_casted.attrs.quantization_dt && attrs.scale_dt == rhs_casted.attrs.scale_dt && attrs.zp_dt == rhs_casted.attrs.zp_dt && - attrs.quantization_type == rhs_casted.attrs.quantization_type;; + attrs.quantization_type == rhs_casted.attrs.quantization_type && + input_size == rhs_casted.input_size; } void save(BinaryOutputBuffer& ob) const override { @@ -75,6 +80,7 @@ struct dynamic_quantize : public primitive_base { ob << make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type)); ob << attrs.scales_zp_output_order; ob << attrs.group_sizes; + ob << input_size; } void load(BinaryInputBuffer& ib) override { @@ -87,6 +93,7 @@ struct dynamic_quantize : public primitive_base { ib >> make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type)); ib >> attrs.scales_zp_output_order; ib >> attrs.group_sizes; + ib >> input_size; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp index e39078cb1011cc..0819a39534696d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp @@ -96,6 +96,7 @@ struct fully_connected : public primitive_base { decompression_scale(decompression_scale), decompression_zero_point(decompression_zero_point), dynamic_quantized_activation(false), + dynamic_quantized_activation_zp(false), input_size(input_size), weights_rank(weights_rank) { OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected 
requires at least decompression scale input"); @@ -109,6 +110,7 @@ struct fully_connected : public primitive_base { /// @param compression_scale Primitive id containing scale factors for weights decompression. /// @param compression_zero_point Primitive id containing zero points for weights decompression. /// @param activation_scale Primitive id containing scale factor for activation. + /// @param activation_zero_point Primitive id containing zero point for activation. fully_connected(const primitive_id& id, const input_info& input, const primitive_id& weights, @@ -116,6 +118,7 @@ struct fully_connected : public primitive_base { const primitive_id& decompression_scale, const primitive_id& decompression_zero_point, const input_info& activation_scale, + const input_info& activation_zero_point, const data_types data_type, const size_t input_size = 2, const size_t weights_rank = 2) @@ -126,11 +129,15 @@ struct fully_connected : public primitive_base { decompression_scale(decompression_scale), decompression_zero_point(decompression_zero_point), dynamic_quantized_activation(false), + dynamic_quantized_activation_zp(false), activation_scale(activation_scale), + activation_zero_point(activation_zero_point), input_size(input_size), weights_rank(weights_rank) { if (activation_scale.is_valid()) dynamic_quantized_activation = true; + if (activation_zero_point.is_valid()) + dynamic_quantized_activation_zp = true; OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); } @@ -144,7 +151,9 @@ struct fully_connected : public primitive_base { primitive_id decompression_scale = ""; primitive_id decompression_zero_point = ""; bool dynamic_quantized_activation = false; + bool dynamic_quantized_activation_zp = false; input_info activation_scale = {"", 0}; + input_info activation_zero_point = {"", 0}; optional_value decompression_zero_point_scalar = optional_value(); /// @brief Primitive dimension size. 
@@ -161,6 +170,7 @@ struct fully_connected : public primitive_base { seed = hash_combine(seed, !decompression_scale.empty()); seed = hash_combine(seed, !decompression_zero_point.empty()); seed = hash_combine(seed, activation_scale.is_valid()); + seed = hash_combine(seed, activation_zero_point.is_valid()); seed = hash_combine(seed, decompression_zero_point_scalar.has_value()); seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f)); return seed; @@ -179,6 +189,7 @@ struct fully_connected : public primitive_base { decompression_scale.empty() == rhs_casted.decompression_scale.empty() && decompression_zero_point.empty() == rhs_casted.decompression_zero_point.empty() && activation_scale.is_valid() == rhs_casted.activation_scale.is_valid() && + activation_zero_point.is_valid() == rhs_casted.activation_zero_point.is_valid() && decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f); } @@ -190,9 +201,11 @@ struct fully_connected : public primitive_base { ob << decompression_scale; ob << decompression_zero_point; ob << activation_scale; + ob << activation_zero_point; ob << input_size; ob << weights_rank; ob << dynamic_quantized_activation; + ob << dynamic_quantized_activation_zp; if (decompression_zero_point_scalar.has_value()) { ob << true; @@ -211,9 +224,11 @@ struct fully_connected : public primitive_base { ib >> decompression_scale; ib >> decompression_zero_point; ib >> activation_scale; + ib >> activation_zero_point; ib >> input_size; ib >> weights_rank; ib >> dynamic_quantized_activation; + ib >> dynamic_quantized_activation_zp; bool has_value; ib >> has_value; @@ -243,6 +258,9 @@ struct fully_connected : public primitive_base { if (activation_scale.is_valid()) ret.push_back(activation_scale); + if (activation_zero_point.is_valid()) + ret.push_back(activation_zero_point); + return ret; } }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index a7a8ae1f229a72..52d828353fa155 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -146,6 +146,7 @@ class debug_configuration { std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization int use_kv_cache_compression; // Enable KV-cache compression int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size + int dynamic_quantize_asym; // Use asymmetric dynamic quantization int disable_horizontal_fc_fusion; // Disable fc horizontal fusion int disable_fc_swiglu_fusion; // Disable swiglu fusion to fc std::set dump_iteration; // Dump n-th execution of network. 
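The asymmetric path that this debug flag enables reduces to one scale and zero point per token (or per group): scale = (CHAR_MAX - CHAR_MIN) / (max - min) and zp = -min * scale, with u8 storage. A minimal standalone sketch of that math, assuming nothing beyond the standard library; the helper name is illustrative, not part of the patch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of asymmetric per-group quantization to u8, mirroring the
// scale = (CHAR_MAX - CHAR_MIN) / (max - min) and zp = -min * scale
// computation used by the dynamic-quantize kernels in this patch.
static void quantize_group_asym_u8(const std::vector<float>& group,
                                   std::vector<uint8_t>& out,
                                   float& scale, float& zero_point) {
    float min_v = std::min(0.0f, *std::min_element(group.begin(), group.end()));
    float max_v = *std::max_element(group.begin(), group.end());
    max_v = std::max(max_v, min_v + 1e-3f);      // guard against a zero range
    scale = 255.0f / (max_v - min_v);            // CHAR_MAX - CHAR_MIN == 255
    zero_point = -min_v * scale;
    out.resize(group.size());
    for (std::size_t i = 0; i < group.size(); ++i) {
        float q = std::round(group[i] * scale + zero_point);
        out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
    }
}
```

Dequantization recovers x ≈ (q - zero_point) / scale, which is roughly what the onednn fully connected path consumes through the new activation scale and zero-point inputs.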
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 29b7cf58a19b54..93f0905b3a1ef7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -463,7 +463,9 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (desc->decompression_zero_point_scalar.has_value()) fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value(); fc_with_bias_prim->activation_scale = desc->activation_scale; + fc_with_bias_prim->activation_zero_point = desc->activation_zero_point; fc_with_bias_prim->dynamic_quantized_activation = desc->dynamic_quantized_activation; + fc_with_bias_prim->dynamic_quantized_activation_zp = desc->dynamic_quantized_activation_zp; } auto& new_fc_node = p.get_or_create(fc_with_bias_prim); fuse_bias_f(fc, new_fc_node, bias_node, eltw_node); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index b9fe00ac525720..ca628a48ac76e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -35,6 +35,7 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { auto params = get_default_params(impl_param, is_shape_agnostic); + const auto& primitive = impl_param.typed_desc(); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); // In Some model, the feature size could be dynamic in input0. 
@@ -48,6 +49,10 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { if (impl_param.output_layouts.size() > 2) params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(2))); + // Keep 2d data as bf layout + if (primitive->input_size == 2) + params.outputs[0] = params.outputs[0].FlattenFeatureAndSpatials(); + const auto& desc = impl_param.typed_desc(); params.group_sizes = desc->attrs.group_sizes; params.scales_output_order = desc->attrs.scales_zp_output_order; @@ -68,7 +73,8 @@ namespace detail { attach_dynamic_quantize_impl::attach_dynamic_quantize_impl() { auto types = { data_types::f16, - data_types::i8 + data_types::i8, + data_types::u8 }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 6b93b279129812..6cca9848af3472 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -83,10 +83,16 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { if (prim->activation_scale.is_valid()) { auto activation_scale_idx = idx++; auto act_scale_mem = instance.dep_memory_ptr(activation_scale_idx); - // TODO: handle group_size here - dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::a, true); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::ab, true); args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, act_scale_mem->get_onednn_memory(desc)}); } + + if (prim->activation_zero_point.is_valid()) { + auto activation_zp_idx = idx++; + auto act_zp_mem = instance.dep_memory_ptr(activation_zp_idx); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_zp_mem->get_layout(), dnnl::memory::format_tag::ab, true); + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC_0, act_zp_mem->get_onednn_memory(desc)}); + } } return args; @@ -245,6 +251,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { ob << has_bias; ob << is_compressed; ob << prim->dynamic_quantized_activation; + ob << prim->dynamic_quantized_activation_zp; bool has_decompression_scale = !prim->decompression_scale.empty(); if (has_decompression_scale) { @@ -271,10 +278,12 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { bool has_bias = false; bool is_compressed = false; bool dynamic_quantized_activation; + bool dynamic_quantized_activation_zp; ib >> input_size; ib >> has_bias; ib >> is_compressed; ib >> dynamic_quantized_activation; + ib >> dynamic_quantized_activation_zp; const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); auto prim = impl_params->typed_desc(); @@ -293,11 +302,12 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { bool has_decompression_zp = !prim->decompression_zero_point.empty() || prim->decompression_zero_point_scalar.has_value(); auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); - int idx = !arg.bias_term() ? 3 : 4; + int idx = !arg.bias_term() ? 
2 : 3; if (has_decompression_zp) { ib >> make_data(&_dzp_data_type, sizeof(dnnl::memory::data_type)); - auto dzp_layout = arg.get_dependency(idx++).get_output_layout(); + auto decompression_zp_idx = ++idx; + auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); if (dzp_layout.count() == 1) { _attrs->set_zero_points(DNNL_ARG_WEIGHTS, COMMON, dnnl::memory::dims{}, _dzp_data_type); @@ -312,12 +322,17 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (dynamic_quantized_activation) { - // TODO: it supports per-token activation scale only + auto src_scale_idx = ++idx; auto partial_shape = impl_params->get_input_layout(0).get_partial_shape(); auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); - - auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(idx).data_type); - _attrs->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + auto& src_scale_shape = impl_params->input_layouts[src_scale_idx].get_partial_shape(); + int src_scale_ngroups = src_scale_shape[src_scale_shape.size() - 1].get_length(); + int src_group_size = innermost_len / src_scale_ngroups; + + auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(src_scale_idx).data_type); + _attrs->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, act_scale_data_type); + if (dynamic_quantized_activation_zp) + _attrs->set_zero_points(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, dnnl::memory::data_type::u8); } if (is_compressed) { @@ -387,15 +402,21 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (prim->dynamic_quantized_activation) { - // Note: it supports per-token activation scale only - ++idx; - auto partial_shape = impl_params.input_layouts[0].get_partial_shape(); + auto src_scale_idx = ++idx; + auto& partial_shape = impl_params.input_layouts[0].get_partial_shape(); auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); + auto& src_scale_shape = impl_params.input_layouts[src_scale_idx].get_partial_shape(); + int src_scale_ngroups = src_scale_shape[src_scale_shape.size() - 1].get_length(); + int src_group_size = innermost_len / src_scale_ngroups; - auto act_scale_data_type = convert_data_type(impl_params.input_layouts[idx].data_type); - attr->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + auto act_scale_data_type = convert_data_type(impl_params.input_layouts[src_scale_idx].data_type); + attr->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, act_scale_data_type); + + if (prim->activation_zero_point.is_valid()) + attr->set_zero_points(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, dnnl::memory::data_type::u8); } + auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), prim->input_size, !prim->bias.empty(), *attr); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index 17498831a542d1..62129866927ea4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -48,7 +48,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { one_of(wei_dt, {data_types::i8, data_types::u8}) && one_of(out_dt, {data_types::f16, data_types::f32, data_types::i32, data_types::i8, 
data_types::u8}); bool compressed_case = fc_prim->compressed_weights && - one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) && + one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8, data_types::u8}) && one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) && one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8}); if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index 6db1790844e501..22c620d712770c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -4,77 +4,180 @@ #include "include/batch_headers/fetch_data.cl" -#if OUTPUT_DIMS != 4 +#if OUTPUT_DIMS != 4 && OUTPUT_DIMS != 2 #error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" #endif #define VLOAD_N CAT(vload, VEC_SIZE) #define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_UCHAR_N CAT(convert_uchar, VEC_SIZE) #define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) #define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) +#if QUANTIZE_GROUP_SIZE <= 128 + +#if ASYMMETRIC_QUANTIZATION +#error "UNIMPLEMENTED: asymmetric quantization when group size is small" +#endif + +KERNEL(dynamic_quantize_gpu_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale + ) { + +#if OUTPUT_DIMS == 2 + const uint b = get_global_id(0); + const uint f_grp = get_global_id(1); + const uint input_offset = INPUT0_GET_INDEX(b, f_grp * QUANTIZE_GROUP_SIZE, 0, 0); + const uint output_offset = OUTPUT_GET_INDEX(b, f_grp * QUANTIZE_GROUP_SIZE, 0, 0); +#else + const uint bf = get_global_id(0); + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; + const uint y_grp = get_global_id(1); + const uint input_offset = INPUT0_GET_INDEX(b, f, y_grp * QUANTIZE_GROUP_SIZE, 0); + const uint output_offset = OUTPUT_GET_INDEX(b, f, y_grp * QUANTIZE_GROUP_SIZE, 0); + +#endif + const uint quantize_block = QUANTIZE_GROUP_SIZE / 4; + half4 input_0[quantize_block]; + char4 quantized_value[quantize_block]; + half max[quantize_block]; + + unroll_for (uint i = 0 ; i < quantize_block; ++i) { + input_0[i] = vload4(0, &input[input_offset + i * 4]); + max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3]))); + } + + half max_value = fmax(0.001h, max[0]); + for (uint i = 1; i < quantize_block; i++) { + max_value = fmax(max_value, max[i]); + } + + half quan_scale = 128.0h / max_value; + + unroll_for (uint i = 0 ; i < quantize_block; ++i) { + quantized_value[i] = convert_char4(input_0[i] * (half4)quan_scale); + vstore4(quantized_value[i], 0, &output[output_offset + i * 4]); + } + +#if OUTPUT_DIMS == 2 + output_scale[OUTPUT1_GET_INDEX(b, f_grp, 0, 0)] = 1.0h / quan_scale; +#else + output_scale[OUTPUT1_GET_INDEX(b, f, y_grp, 0)] = 1.0h / quan_scale; +#endif +} + +#else // !(QUANTIZE_GROUP_SIZE <= 128) + REQD_SUB_GROUP_SIZE(SIMD) KERNEL(dynamic_quantize_gpu_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, - __global OUTPUT1_TYPE* output_scale
+#if ASYMMETRIC_QUANTIZATION + , __global OUTPUT2_TYPE* output_zp +#endif + ) { const uint bf = (uint)get_global_id(2); const uint sglid = get_sub_group_local_id(); const uint local_id = (uint)get_local_id(1); const uint block_size = SIMD * VEC_SIZE; +#if OUTPUT_DIMS == 2 + const uint b_offset = bf * INPUT0_BATCH_PITCH; +#else const uint b_offset = bf * INPUT0_FEATURE_PITCH; - +#endif const uint offset = b_offset + VEC_SIZE * sglid; const uint iteration = ALIGNED_BLOCK_NUM / BLOCK_NUM; - __local half local_mem[BLOCK_NUM]; + __local half local_mem_max[BLOCK_NUM]; + __local half local_mem_min[BLOCK_NUM]; MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) val[iteration]; MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) abs_val; - half max = 0.0h; half grp_max = 0.001h; - half max_value; + half grp_min = 0.001h; + half max_value = 0.0h; + half min_value = 0.0h; unroll_for(int i = 0; i < iteration; ++i) { if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) continue; val[i] = AS_INPUT_TYPE_N(VLOAD_N(0, input + offset + ((local_id * iteration + i) * block_size))); - abs_val = fabs(val[i]); - +#if ASYMMETRIC_QUANTIZATION unroll_for (int j = 0; j < VEC_SIZE; j++) { - max = fmax(max, abs_val[j]); + max_value = fmax(max_value, val[i][j]); + min_value = fmin(min_value, val[i][j]); } + grp_max = fmax(grp_max, max_value); + grp_min = fmin(grp_min, min_value); +#else + abs_val = fabs(val[i]); + + unroll_for (int j = 0; j < VEC_SIZE; j++) + max_value = fmax(max_value, abs_val[j]); - grp_max = fmax(grp_max, max); + grp_max = fmax(grp_max, max_value); +#endif } max_value = sub_group_reduce_max(grp_max); - if (sglid == 0) - local_mem[local_id] = max_value; +#if ASYMMETRIC_QUANTIZATION + min_value = sub_group_reduce_min(grp_min); +#endif + + if (sglid == 0) { + local_mem_max[local_id] = max_value; +#if ASYMMETRIC_QUANTIZATION + local_mem_min[local_id] = min_value; +#endif + } barrier(CLK_LOCAL_MEM_FENCE); for (int j = 0; j < BLOCK_NUM; j++) { - max_value = fmax(max_value, local_mem[j]); + max_value = fmax(max_value, local_mem_max[j]); +#if ASYMMETRIC_QUANTIZATION + min_value = fmin(min_value, local_mem_min[j]); +#endif } - half scale = 127.0h / max_value; +#if ASYMMETRIC_QUANTIZATION + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + OUTPUT2_TYPE zp = (OUTPUT2_TYPE)(-min_value * scale); +#else + OUTPUT1_TYPE scale = 127.0h / max_value; +#endif + unroll_for(int i = 0; i < iteration; ++i) { if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) continue; val[i] *= scale; +#if ASYMMETRIC_QUANTIZATION + val[i] += zp; + VSTORE_N(CAT(CONVERT_UCHAR_N, _rte)(val[i]), 0, output + offset + ((local_id * iteration + i) * block_size)); +#else VSTORE_N(CAT(CONVERT_CHAR_N, _rte)(val[i]), 0, output + offset + ((local_id * iteration + i) * block_size)); +#endif } - if (sglid == 0 && local_id == 0) + if (sglid == 0 && local_id == 0) { output_scale[bf] = 1.0h / scale; +#if ASYMMETRIC_QUANTIZATION + output_zp[bf] = convert_uchar_rte(zp); +#endif + } } +#endif // QUANTIZE_GROUP_SIZE <= 128 diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 62482b8b9b5047..4acf87eb37ceb0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -4,6 +4,16 @@ #include "include/batch_headers/fetch_data.cl" +#define UINT64_MAX 0xFFFFFFFFFFFFFFFF + +#if ASYMMETRIC_QUANTIZATION && 
UNSIGNED_OUTPUT + #define TO_OUTPUT_TYPE_RTE(val) convert_uchar_rte(val) + #define TO_OUTPUT_VEC_TYPE_RTE(val) convert_uchar8_rte(val) +#else + #define TO_OUTPUT_TYPE_RTE(val) convert_char_rte(val) + #define TO_OUTPUT_VEC_TYPE_RTE(val) convert_char8_rte(val) +#endif + #if OUTPUT_DIMS != 4 #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" #endif @@ -33,19 +43,21 @@ KERNEL(dynamic_quantize_gpu_ref)( const uint bf = (uint)get_global_id(0); const uint b = bf / INPUT0_FEATURE_NUM; const uint f = bf % INPUT0_FEATURE_NUM; - const uint y = (uint)get_global_id(1); + const uint out_y = (uint)get_global_id(1); + const uint y = out_y * GROUP_SIZE_DIM2; // quantization may be grouped for y axis const uint x = (uint)get_global_id(2); #ifdef SCALES_OUTPUT_ORDER - const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, out_y, x); #else - const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, y, x); + const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, out_y, x); #endif half max_val = INPUT0_VAL_MIN; half min_val = INPUT0_VAL_MAX; for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { - for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == UINT64_MAX ? INPUT0_SIZE_Y : GROUP_SIZE_DIM2); y_off++) { + // It is assumed that grouped quantization happens only for 3d input case where we don't have x axis #if GROUP_SIZE_DIM3 == 1 const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); half val = input[offset]; @@ -88,53 +100,49 @@ KERNEL(dynamic_quantize_gpu_ref)( #if ASYMMETRIC_QUANTIZATION OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_val - min_val)); +# if UNSIGNED_OUTPUT + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale); +# else // !UNSIGNED_OUTPUT OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale) - CHAR_MAX; -#else +# endif +#else // !ASYMMETRIC_QUANTIZATION max_val = work_group_reduce_max(max_val); OUTPUT1_TYPE scale = 127.0h / max_val; #endif for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { - for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == UINT64_MAX ? 
INPUT0_SIZE_Y : GROUP_SIZE_DIM2); y_off++) { #if GROUP_SIZE_DIM3 == 1 const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); half val = input[in_offset]; -#if ASYMMETRIC_QUANTIZATION val *= scale; +#if ASYMMETRIC_QUANTIZATION val += zp; - output[out_offset] = convert_char_rte(val); -#else - val *= scale; - output[out_offset] = convert_char_rte(val); #endif + output[out_offset] = TO_OUTPUT_TYPE_RTE(val); #else const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); -#if ASYMMETRIC_QUANTIZATION val *= scale; +#if ASYMMETRIC_QUANTIZATION val += zp; -#else - val *= scale; #endif - vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8); + vstore8(TO_OUTPUT_VEC_TYPE_RTE(val), 0, output + out_offset + x * 8); } x *= 8; for (; x < INPUT0_SIZE_X; x++) { half val = input[in_offset + x]; -#if ASYMMETRIC_QUANTIZATION val *= scale; +#if ASYMMETRIC_QUANTIZATION val += zp; - output[out_offset + x] = convert_char_rte(val); -#else - val *= scale; - output[out_offset + x] = convert_char_rte(val); #endif + output[out_offset + x] = TO_OUTPUT_TYPE_RTE(val); } #endif } @@ -145,6 +153,6 @@ KERNEL(dynamic_quantize_gpu_ref)( #if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP output_scale[scale_idx + 1] = zp; #elif ASYMMETRIC_QUANTIZATION - output_zp[scale_idx] = zp; + output_zp[scale_idx] = convert_uchar_rte(zp); #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index 52a648679499f2..b4f667475f26f1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -30,9 +30,11 @@ static std::pair get_input_bf_size(const dynamic_quantize_params static size_t get_match_vector_size(const dynamic_quantize_params& params) { auto block_sizes = { 8, 4, 2 }; + auto bf = get_input_bf_size(params); + auto f = bf.second; for (auto block_size : block_sizes) { - if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) { + if ((f / simd) % block_size == 0) { return block_size; } } @@ -43,10 +45,13 @@ static size_t get_match_vector_size(const dynamic_quantize_params& params) { ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::UINT8); k.EnableOutputDataType(Datatype::INT8); k.EnableDifferentTypes(); - k.EnableAllInputLayout(); - k.EnableAllOutputLayout(); + k.EnableInputLayout(DataLayout::bf); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bf); + k.EnableOutputLayout(DataLayout::bfyx); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); @@ -68,6 +73,8 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num)); jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num)); jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num)); + jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", params.group_sizes.back())); + 
jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); return jit; @@ -76,15 +83,20 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_params& params) const { CommonDispatchData dispatchData; - auto vec_size = get_match_vector_size(params); - auto bf_size = get_input_bf_size(params); - size_t total_block_num = bf_size.second / (simd * vec_size); - size_t batch = get_input_bf_size(params).first; - size_t block_num = (total_block_num > 32) ? 32 : total_block_num; - - dispatchData.gws = {simd, block_num, batch}; - dispatchData.lws = {simd, block_num, 1}; - + if (params.group_sizes.back() <= 128) { + auto bf_size = get_input_bf_size(params); + dispatchData.gws = {bf_size.first, bf_size.second / params.group_sizes.back(), 1}; + dispatchData.lws = {1, 1, 1}; + } else { + auto vec_size = get_match_vector_size(params); + auto bf_size = get_input_bf_size(params); + size_t total_block_num = bf_size.second / (simd * vec_size); + size_t batch = get_input_bf_size(params).first; + size_t block_num = (total_block_num > 32) ? 32 : total_block_num; + + dispatchData.gws = {simd, block_num, batch}; + dispatchData.lws = {simd, block_num, 1}; + } return dispatchData; } @@ -147,8 +159,9 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { const auto& dq_params = static_cast(params); - // Todo : Add proper exception here - if (((dq_params.inputs[0].X().v * dq_params.inputs[0].Y().v) % (simd * 2)) != 0) + + auto bf = get_input_bf_size(dq_params); + if (((bf.second) % (simd * 2)) != 0) return false; if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0) @@ -157,8 +170,10 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (dq_params.append_axis != -1) return false; - if (dq_params.group_sizes.back() != UINT64_MAX) - return false; + for (size_t i = 0; i < dq_params.group_sizes.size() - 1; i++) { + if (dq_params.group_sizes[i] != 1) + return false; + } // Allow only default scales order const auto& scales_output_order = dq_params.scales_output_order; @@ -168,7 +183,16 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { return false; } + if (dq_params.use_asymmetric_quantization) { + if (dq_params.combine_scales_and_zp) + return false; + if (dq_params.outputs[0].GetDType() != Datatype::UINT8) + return false; + } + return true; } + + } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index bd3d0f87cdc931..f432fa6ac5756d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -11,6 +11,7 @@ ParamsKey DynamicQuantizeKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); k.EnableTensorOffset(); @@ -53,6 +54,7 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", 
params.use_asymmetric_quantization)); jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); + jit.AddConstant(MakeJitConstant("UNSIGNED_OUTPUT", params.outputs[0].GetDType() == Datatype::UINT8 ? 1 : 0)); auto group_sizes = params.group_sizes; group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); @@ -71,12 +73,26 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); auto group_sizes = params.group_sizes; - group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); + group_sizes.resize(std::max((size_t)4, group_sizes.size()), 1); auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; auto x_size = group_sizes[3] == 1 ? params.outputs[0].X().v : 1; + OPENVINO_ASSERT( + (group_sizes[0] == 1 || group_sizes[0] == params.outputs[0].Batch().v || group_sizes[0] == UINT64_MAX) && + (group_sizes[1] == 1 || group_sizes[1] == params.outputs[0].Feature().v || group_sizes[1] == UINT64_MAX) && + (group_sizes[2] == 1 || group_sizes[2] == params.outputs[0].Y().v || group_sizes[2] == UINT64_MAX + || (params.outputs[0].Y().v % group_sizes[2] == 0 && params.outputs[0].X().v == 1)) && // Grouped quantization is only supported for 3d case + (group_sizes[3] == 1 || group_sizes[3] == params.outputs[0].X().v || group_sizes[3] == UINT64_MAX), + "[GPU] Unsupported dynamic quantization configuration: (", + group_sizes[0], ",", group_sizes[1], ",", group_sizes[2], ",", group_sizes[3], ") - (", + params.outputs[0].Batch().v, ",", params.outputs[0].Feature().v, ",", params.outputs[0].Y().v, ",", params.outputs[0].X().v, ")"); + + // Grouped quantization is supported only over y axis + if (params.group_sizes[2] > 1 && params.group_sizes[2] != UINT64_MAX) + y_size = params.outputs[0].Y().v / params.group_sizes[2]; + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; dispatchData.lws = {1, 1, 1}; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 46e8f7f1104f0d..68da7aea7b1fe6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -124,16 +124,16 @@ static bool should_dynamic_quantize(const fully_connected_params& params, bool p if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) && (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) && params.inputs[0].GetDType() == Datatype::F16 && is_weight_dyn_quantizable(params)) { - if (print_log) { - GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << - ", Dyn-quan group size: " << dynamic_quantization_group_size << - ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << - ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << - ", W:" << kernel_selector::toString(params.weights.GetDType()) << - "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << - ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << 
- ", Y: " << params.inputs[0].Y().v << std ::endl; - } + if (print_log) { + GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << + ", Dyn-quan group size: " << dynamic_quantization_group_size << + ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << + ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << + ", W:" << kernel_selector::toString(params.weights.GetDType()) << + "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << + ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << + ", Y: " << params.inputs[0].Y().v << std ::endl; + } return true; } diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 85f28cbd711678..4c11bdb21971e9 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -18,7 +18,8 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_attrs()); + op->get_attrs(), + op->get_input_partial_shape(0).size()); prim.num_outputs = op->get_output_size(); diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index 7b0aa921ef3ad5..5f4fe19c5c4c08 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -26,7 +26,7 @@ namespace ov { namespace intel_gpu { static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); + validate_inputs_count(op, {4, 5, 6, 7}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); auto supports_immad = p.get_engine().get_device_info().supports_immad; @@ -39,6 +39,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share const size_t W_ZP_IDX = input_idx; std::string zp_name = op->get_input_size() > input_idx ? inputs[input_idx++].pid : ""; auto activation_scale_input = op->get_input_size() > input_idx ? inputs[input_idx++] : cldnn::input_info(); + auto activation_zero_point_input = op->get_input_size() > input_idx ? inputs[input_idx++] : cldnn::input_info(); float zp_value = 0.0f; bool has_scalar_zp = false; @@ -58,6 +59,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share scale_name, has_scalar_zp && !supports_immad ? 
"" : zp_name, activation_scale_input, + activation_zero_point_input, cldnn::element_type_to_data_type(op->get_output_element_type(0)), op->get_input_partial_shape(0).size(), op->get_input_partial_shape(1).size()); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index b623c86fabe02c..368e25abe2ddac 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -10,6 +10,7 @@ #include "openvino/op/lstm_sequence.hpp" #include "openvino/op/loop.hpp" #include "openvino/op/search_sorted.hpp" +#include "ov_ops/dynamic_quantize.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/program_builder.hpp" @@ -357,6 +358,9 @@ bool ProgramBuilder::requires_new_shape_infer(const std::shared_ptr& o if (ov::is_type(op)) return true; + if (ov::is_type(op)) + return true; + if (ov::is_type(op)) { const auto body_function = std::static_pointer_cast(op)->get_function(); if (body_function->is_dynamic()) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index c36212713ae717..61dc40e2713800 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -21,24 +21,11 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size : ov::pass::MatcherPass() { GPU_DEBUG_GET_INSTANCE(debug_config); using namespace ov::pass::pattern; - - // per-token quantization is supported - if (group_size != UINT64_MAX) { - GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; - return; - } - auto is_dynamic = [](const ov::Output& output) -> bool { - bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); - size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); - for (size_t idx = 0; idx < num_inputs; idx++) { - is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); - } - return is_dynamic; - }; + using QuantizationType = ov::op::internal::DynamicQuantize::QuantizationType; auto data = any_input(); - auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}, is_dynamic); - auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}, is_dynamic); + auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}); + auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}); auto fully_connected_compressed = std::make_shared(OutputVector{fully_connected_compressed3, fully_connected_compressed4}); @@ -65,12 +52,20 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size ov::op::internal::DynamicQuantize::Attributes config; config.quantization_dt = element::i8; - config.quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Symmetric; + config.quantization_type = QuantizationType::Symmetric; config.scale_dt = element::f16; config.group_sizes = shape_group_size; + if (debug_config->dynamic_quantize_asym) { + config.quantization_type = QuantizationType::Asymmetric; + config.quantization_dt = element::u8; + config.zp_dt = element::u8; // it supports u8 only now + } + auto dyn_quan = 
std::make_shared(m_data, config); auto optional_w_zp = m_fc->get_input_size() > 4 ? m_fc->get_input_node_shared_ptr(4) : std::make_shared(); + auto optional_a_zp = config.quantization_type == QuantizationType::Symmetric ? + std::make_shared() : dyn_quan->output(2); auto output_type = m_fc->get_output_type(); if (output_type == ov::element::undefined) @@ -82,6 +77,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size m_fc->get_input_node_shared_ptr(3), optional_w_zp, dyn_quan->output(1), + optional_a_zp, output_type); ov::replace_node(m_fc, new_fc); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp index 2e3819d7e850ee..dd5c555b1e6bc8 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp @@ -14,11 +14,13 @@ FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& w_decompression_scale, const ov::Output& w_decompression_zero_point, const ov::Output& a_decompression_scale, + const ov::Output& a_decompression_zero_point, const ov::element::Type output_type) : FullyConnected(A, B, bias, output_type) { set_argument(3, w_decompression_scale); set_argument(4, w_decompression_zero_point); set_argument(5, a_decompression_scale); + set_argument(6, a_decompression_zero_point); validate_and_infer_types(); } @@ -60,12 +62,13 @@ std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const new_args.at(3), new_args.at(4), m_output_type); - else if (new_args.size() == 6) + else if (new_args.size() == 7) return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), new_args.at(4), + new_args.at(5), new_args.at(6), m_output_type); else diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index e47ccbb09a9c43..50eecf51b945b7 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -975,18 +975,34 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass manager.register_pass(); - auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); if (device_info.supports_immad) { + auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); pass_config->set_callback([=](const_node_ptr& root) -> bool { if (root->get_input_node_shared_ptr(0)->get_element_type() == ov::element::Type_t::f32) { - GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: input type is not supported" << std::endl; return true; } auto weight_shape = root->get_input_partial_shape(1); const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); if (innermost_size < 32) { - GPU_DEBUG_TRACE << "Dynamic quantization: shape is too small " << innermost_size << " / " << dynamic_quantization_group_size << std::endl; + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: shape is too small - " << innermost_size << std::endl; + return 
true; + } + + // AZP does not support 8bit weight + if (debug_config->dynamic_quantize_asym + && (root->get_input_element_type(1) == ov::element::i8 || root->get_input_element_type(1) == ov::element::u8)) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: asym quantization does not support 8bit weight" << std::endl; + return true; + } + + bool has_wzp = root->get_input_size() > 4; + if ((root->get_input_element_type(1) == ov::element::i8 || root->get_input_element_type(1) == ov::element::u8) + && has_wzp + && dynamic_quantization_group_size != UINT64_MAX) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off:" + " asym 8bit weight does not support grouped quantization" << std::endl; return true; } return false; diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 65ca31f16c720c..380480dccc68bf 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -190,6 +190,7 @@ static void print_help_messages() { "separated by space. Support case-insensitive and regular expression. For example .*fully_connected.*"); message_list.emplace_back("OV_GPU_DynamicQuantizeGroupSize", "Specify a group size of dynamic quantization to enable " "dynamic quantization for Fully-connected primitive."); + message_list.emplace_back("OV_GPU_DynamicQuantizeAsym", "Enable asymmetric dynamic quantization when set as 1."); message_list.emplace_back("OV_GPU_DisableHorizontalFCFusion", "Disable horizontal fc fusion"); message_list.emplace_back("OV_GPU_DisableFCSwigluFusion", "Disable fc + swiglu fusion"); message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space."); @@ -260,6 +261,7 @@ debug_configuration::debug_configuration() , use_usm_host(0) , use_kv_cache_compression(-1) , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) + , dynamic_quantize_asym(0) , disable_horizontal_fc_fusion(0) , disable_fc_swiglu_fusion(0) { #ifdef GPU_DEBUG_CONFIG @@ -315,6 +317,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("UseUsmHost", use_usm_host); get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression); get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size); + get_gpu_debug_env_var("DynamicQuantizeAsym", dynamic_quantize_asym); get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion); get_gpu_debug_env_var("DisableFCSwigluFusion", disable_fc_swiglu_fusion); std::string dump_iteration_str; diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 30a9477e1600dd..804ad81f2d3735 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -57,7 +57,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::internal::query_model_ratio, 1.0f), std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED), std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}), - std::make_tuple(ov::hint::dynamic_quantization_group_size, 32), + std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), @@ -254,6 +254,11 @@ void 
ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { set_property(ov::hint::kv_cache_precision(ov::element::i8)); } + // Enable dynamic quantization by default for non-systolic platforms + if (!is_set_by_user(ov::hint::dynamic_quantization_group_size) && !info.supports_immad) { + set_property(ov::hint::dynamic_quantization_group_size(32)); + } + user_properties.clear(); } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 27c57aa072878d..b430884decb71a 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -58,7 +58,8 @@ using MatmulWeightsDecompressionParams = std::tuple; class MatmulWeightsDecompression : public testing::WithParamInterface, @@ -74,6 +75,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface(dyn_input_ps.size(), 1); - group_sizes.back() = UINT64_MAX; + group_sizes.back() = group_size; - auto input_data = rg.generate_random_1d(ov::shape_size(data_shape), -16.0f, 16.0f); + auto input_data = rg.generate_random_1d(ov::shape_size(data_shape), -16.0f, 20.0f); set_values(input_mem, input_data); auto in_layout_f32 = input_shape.is_dynamic() ? layout{ dyn_input_ps, data_types::f32, format::bfyx } @@ -53,17 +58,15 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { dynamic_quantize::Attributes dq_config; dq_config.quantization_type = quantization_type; - dq_config.quantization_dt = data_types::i8; + dq_config.quantization_dt = quant_dt; dq_config.scale_dt = data_types::f16; - dq_config.zp_dt = data_types::undefined; + dq_config.zp_dt = zp_dt; dq_config.group_sizes = group_sizes; - dq_config.scales_zp_output_order = { 0, 1, 2, 3 }; - dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + dq_config.scales_zp_output_order = { 0, 1, 2}; - if (quantization_type == QuantizationType::Asymmetric) { - dq_config.zp_dt = data_types::f16; - dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; - } + if (data_shape.size() == 4) + dq_config.scales_zp_output_order.emplace_back(3); + dq_config.output_storage_type = storage_type; auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), dq_config); @@ -156,6 +159,19 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch) { this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {1, 1, 1, 4096}); } +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_asym_act) { + this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {1, 1, 1, 4096}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::u8, data_types::u8, OutputStorageType::Planar); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_grouped) { + this->test_dynamic_quantization(false, {1, 1, 4096}, {64, 1, 4096}, QuantizationType::Symmetric, 32); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch_grouped) { + this->test_dynamic_quantization(false, {-1, 1, 4096}, {1, 1, 4096}, QuantizationType::Symmetric, 32); +} + TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only) { this->test_dynamic_quantization(false, {-1, 1, 1, 33}, 
{16, 1, 1, 33}); } @@ -177,33 +193,36 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned_dynamic) { } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache) { - this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched) { - this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered) { - this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered) { - this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_asym) { - this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::i8, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym) { - this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::i8, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym) { - this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); -} - -TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym) { - this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::i8, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp 
b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 6bf44a31add0f4..f59dc5c42cffc1 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1555,7 +1555,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(32)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(32)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1643,7 +1643,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1669,7 +1669,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1753,7 +1753,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1780,9 +1780,9 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); if (is_dyn_quan) { - config.set_property(ov::hint::dynamic_quantization_group_size(32)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(32)); } else { - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); } network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1923,7 +1923,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl = { in_layout.format, "", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim1", fc_impl }, { "fc_prim2", fc_impl } })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + 
config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1952,7 +1952,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -2905,7 +2905,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topo, config); network.set_input_data("input", input_mem); @@ -2931,7 +2931,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), false); @@ -3031,7 +3031,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topo, config); network.set_input_data("input", input_mem); @@ -3057,7 +3057,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), false); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp index fb30222998008b..3384fb1ed514f6 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp @@ -71,11 +71,11 @@ class check_hash_value: public ::testing::Test { const auto primitive_hash = primitve->hash(); const auto params_hash = primitve->type->get_fake_aligned_params(*prim_inst->get_impl_params()).hash(); if (!engine.get_device_info().supports_immad) { - ASSERT_EQ(primitive_hash, 
8017451717095756666UL); - ASSERT_EQ(params_hash, 8889154389021912103UL); + ASSERT_EQ(primitive_hash, 9510988594087947885UL); + ASSERT_EQ(params_hash, 7833603199176871790UL); } else { - ASSERT_EQ(primitive_hash, 8017451717095756666UL); - ASSERT_EQ(params_hash, 10847775446937354749UL); + ASSERT_EQ(primitive_hash, 9510988594087947885UL); + ASSERT_EQ(params_hash, 16259702189938020305UL); } } From a3f4edb3d8f12769c7ae7d39206730502fae711f Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Mon, 9 Dec 2024 14:47:37 +0900 Subject: [PATCH 12/23] [GPU] Fix crash on swiglu fused case (due to outer_ofm == 1) (#27972) ### Details: - fixed crash happens in minicpm-1b-sft int4 model ### Tickets: - *ticket-id* --- .../fully_connected_kernel_bf_tiled.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 68da7aea7b1fe6..d0f881adcd88b1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -435,10 +435,14 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, return selector.Default(tune_params(1, 1, 4, 4, 1, 1, 1, EXE_MODE_DEFAULT)); } } else if (is_weight_small_kn(params, output_f)) { - if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) - return selector.Default(tune_params(1, 1, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); - else + if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { + if (swiglu_fused) + return selector.Default(tune_params(1, 1, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)); + else + return selector.Default(tune_params(1, 1, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } else { return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } } else { if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { return selector.Default(tune_params(1, 1, 4, 4, 1, 1, 1, EXE_MODE_DEFAULT)); @@ -865,7 +869,9 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa auto output_f = get_output_aligned_bf_size(fc_params, false).second; WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; - if (!is_swiglu_fused(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 + if (is_swiglu_fused(fc_params)) { + weights_layout = WeightsLayout::os_is_yx_osv32_isv2; + } else if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { From 27138a8af6b9cd8e79b394ab5b56b4c61fd7deba Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Mon, 9 Dec 2024 07:40:37 +0100 Subject: [PATCH 13/23] [DOCS] saveModelSync method in Node.js addon (#27960) Porting: #27958 Signed-off-by: sgolebiewski-intel --- docs/sphinx_setup/api/nodejs_api/addon.rst | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/docs/sphinx_setup/api/nodejs_api/addon.rst b/docs/sphinx_setup/api/nodejs_api/addon.rst index f6ee4ab7b15836..7c42824bcd88a3 100644 --- 
a/docs/sphinx_setup/api/nodejs_api/addon.rst +++ b/docs/sphinx_setup/api/nodejs_api/addon.rst @@ -49,6 +49,7 @@ The **openvino-node** package exports ``addon`` which contains the following pro resizeAlgorithm: typeof resizeAlgorithm; PrePostProcessor: PrePostProcessorConstructor; }; + saveModelSync(model: Model, path: string, compressToFp16?: boolean): void; element: typeof element; } @@ -142,3 +143,39 @@ Properties - **Defined in:** `addon.ts:674 `__ + +.. rubric:: saveModelSync + +* + + .. code-block:: ts + + saveModelSync(model: Model, path: string, compressToFp16?: boolean): void; + + + This method saves a model to IR (xml and bin files), applying all + necessary transformations that are usually added during model conversion. + Particularly, weights are compressed to FP16 by default, and debug information + in model nodes is cleaned up. + + * **Parameters:** + + - model: :doc:`Model ` + + A model which will be converted to IR and saved. + + - path: string + + A path for saving the model. + + - ``Optional`` + + - compressToFp16: boolean + + Compression of weights to FP16 floating point precision. The default value is `true` . + + * **Returns:** void + + * **Defined in:** + `addon.ts:692 `__ + From 15a9b617fcfd591a14daf632cdeecbe99255bd64 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 9 Dec 2024 12:33:16 +0400 Subject: [PATCH 14/23] [TF FE] Run If tests on all platforms (#27966) **Details:** Run If tests on all platforms **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .../tensorflow_tests/test_tf_If.py | 44 ++++++++----------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_If.py b/tests/layer_tests/tensorflow_tests/test_tf_If.py index 67686ef53a5750..21dee5aa28616d 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_If.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_If.py @@ -1,13 +1,13 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(32345) + class TestIfFloat(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -18,9 +18,9 @@ def _prepare_input(self, inputs_info): x_shape = inputs_info['x:0'] y_shape = inputs_info['y:0'] inputs_data = {} - inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) - inputs_data['x:0'] = np.random.randint(1, 10, x_shape).astype(np.float32) - inputs_data['y:0'] = np.random.randint(-50, 50, y_shape).astype(np.float32) + inputs_data['cond:0'] = rng.integers(0, 2, cond_shape).astype(bool) + inputs_data['x:0'] = rng.integers(1, 10, x_shape).astype(np.float32) + inputs_data['y:0'] = rng.integers(-50, 50, y_shape).astype(np.float32) return inputs_data def create_if_net(self, x_shape, y_shape, lower_control_flow): @@ -69,12 +69,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -89,9 +87,9 @@ def _prepare_input(self, 
inputs_info): ind_shape = inputs_info['ind:0'] y_shape = inputs_info['y:0'] inputs_data = {} - inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) - inputs_data['ind:0'] = np.random.randint(1, 10, ind_shape).astype(np.int32) - inputs_data['y:0'] = np.random.randint(-50, 50, y_shape).astype(np.float32) + inputs_data['cond:0'] = rng.integers(0, 2, cond_shape).astype(bool) + inputs_data['ind:0'] = rng.integers(1, 10, ind_shape).astype(np.int32) + inputs_data['y:0'] = rng.integers(-50, 50, y_shape).astype(np.float32) return inputs_data def create_if_net(self, ind_shape, y_shape, lower_control_flow): @@ -141,12 +139,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -161,9 +157,9 @@ def _prepare_input(self, inputs_info): y_shape = inputs_info['y:0'] z_shape = inputs_info['z:0'] inputs_data = {} - inputs_data['x:0'] = np.random.randint(0, 6, x_shape).astype(np.int32) - inputs_data['y:0'] = np.random.randint(1, 10, y_shape).astype(np.float32) - inputs_data['z:0'] = np.random.randint(-50, 50, z_shape).astype(np.float32) + inputs_data['x:0'] = rng.integers(0, 6, x_shape).astype(np.int32) + inputs_data['y:0'] = rng.integers(1, 10, y_shape).astype(np.float32) + inputs_data['z:0'] = rng.integers(-50, 50, z_shape).astype(np.float32) return inputs_data def create_if_net(self, y_shape, z_shape, lower_control_flow): @@ -221,12 +217,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -241,9 +235,9 @@ def _prepare_input(self, inputs_info): x_shape = inputs_info['x:0'] y_shape = inputs_info['y:0'] inputs_data = {} - inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) - inputs_data['x:0'] = np.random.randint(1, 10, x_shape).astype(np.float32) - inputs_data['y:0'] = np.random.randint(-50, 50, y_shape).astype(np.float32) + inputs_data['cond:0'] = rng.integers(0, 2, cond_shape).astype(bool) + inputs_data['x:0'] = rng.integers(1, 10, x_shape).astype(np.float32) + inputs_data['y:0'] = rng.integers(-50, 50, y_shape).astype(np.float32) return inputs_data def create_sequential_ifs_net(self, x_shape, y_shape, lower_control_flow): @@ -313,12 +307,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - 
pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_sequential_ifs_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) From 408a5e065200b1fcb41200f9361094fa1c7df5d7 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Mon, 9 Dec 2024 17:49:45 +0900 Subject: [PATCH 15/23] [GPU] update onednn to latest 3.7-pc (#27811) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 0f269193c74663..36e090a367a431 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 0f269193c7466313888d3338209d0d06a22cc6fa +Subproject commit 36e090a367a4312a1caa2db9e95fb94d17d7573b From de949b4a2b59faf1bf701528dd37b7ecd076d4e0 Mon Sep 17 00:00:00 2001 From: Yuan Hu Date: Mon, 9 Dec 2024 17:08:40 +0800 Subject: [PATCH 16/23] [CPU] enable brdgmm kernel in CPU plugin (#27589) ### Details: - *replace impl string brdgmm with brgconv* - *add test case* - *remove skip CVS-56143 config, CVS-56143 is already closed* - *remove skip CVS-53578 config, CVS-53578 is already closed* - *use new ticket CVS-157596 to track leftover test case* ### Tickets: - *CVS-156792* --------- Signed-off-by: HU Yuan2 --- src/plugins/intel_cpu/src/nodes/conv.cpp | 13 +- .../intel_cpu/src/onednn/iml_type_mapper.cpp | 3 + .../intel_cpu/src/onednn/iml_type_mapper.h | 3 + .../single_layer_tests/group_convolution.cpp | 126 +++++++++++++++++- .../skip_tests_config.cpp | 10 +- 5 files changed, 140 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 7cf7698e989343..53d53d093cfabf 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -343,6 +343,7 @@ const std::vector& Convolution::getDefaultImplPriority() { impl_desc_type::winograd_acl, impl_desc_type::gemm_acl, impl_desc_type::acl, + impl_desc_type::brgconv_avx512_dw, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, impl_desc_type::jit_avx512_amx_dw, @@ -353,6 +354,7 @@ const std::vector& Convolution::getDefaultImplPriority() { impl_desc_type::jit_avx512_dw, impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, + impl_desc_type::brgconv_avx2_dw, impl_desc_type::brgconv_avx2_1x1, impl_desc_type::brgconv_avx2, impl_desc_type::jit_uni_dw, @@ -815,7 +817,11 @@ void Convolution::initSupportedPrimitiveDescriptors() { #endif for (size_t dIdx = 0; dIdx < descs.size(); dIdx++) { auto& desc = descs[dIdx]; - auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); + auto primitive_desc = desc.get(true); //true mean allow empty + if (primitive_desc == nullptr) { + continue; + } + auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(primitive_desc)); auto add_supported_desc = [&](dnnl::primitive_desc& desc) { addSupportedPrimitiveDescriptor(desc); @@ -823,7 +829,7 @@ void Convolution::initSupportedPrimitiveDescriptors() { }; const bool first_match = customImplPriorities.empty(); - DEBUG_LOG("#", getName(), + DEBUG_LOG("#", getName(), ",descIndex:", dIdx + 1, "/", descs.size(), ", itpd.impl_info_str(): ", desc.impl_info_str(), ", parsed imp_type: ", impl_type_to_string(parse_impl_name(desc.impl_info_str())), ", first_match: ", first_match ? 
"true" : "false"); @@ -944,8 +950,7 @@ void Convolution::createDescriptor(const std::vector& inputDesc, const auto desc = createDescriptorInternal(getEngine(), inDnnlDesc, weightDnnlDesc, biasDnnlDesc, outDnnlDesc, withBiases, stride, dilation, paddingL, paddingR, alg, attr); - if (desc) - descs.emplace_back(desc); + descs.emplace_back(desc); } } } diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp index d7a1e5979ddad9..5c57a94f69f67d 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp @@ -17,6 +17,7 @@ impl_desc_type parse_impl_name(std::string impl_desc_name) { if (pos != std::string::npos) impl_desc_name.replace(pos, std::string(#_wrd).length(), #_sub); } // Replace the ONEDNN pd name with OV definition. REPLACE_WORD(brg_conv, brgconv); + REPLACE_WORD(brdgmm, brgconv); REPLACE_WORD(avx10_1_512, avx512); REPLACE_WORD(brg_matmul, brgemm); @@ -119,6 +120,8 @@ const char* impl_type_to_string(impl_desc_type type) { CASE(brgconv_sse42_1x1); CASE(brgconv_uni_1x1); CASE(brgconv_avx512_amx_1x1); + CASE(brgconv_avx512_dw); + CASE(brgconv_avx2_dw); CASE(brgemm_avx512); CASE(brgemm_avx2); CASE(brgemm_avx); diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h index 3fd79716c7cd72..45a71bdb88dd33 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h @@ -98,6 +98,9 @@ enum impl_desc_type : int64_t { brgconv_uni_1x1 = brgconv | uni | _1x1, brgconv_avx512_amx_1x1 = brgconv | avx512 | amx | _1x1, + brgconv_avx2_dw = brgconv_avx2 | _dw, + brgconv_avx512_dw = brgconv_avx512 | _dw, + brgemm_avx512 = brgemm | avx512, brgemm_avx2 = brgemm | avx2, brgemm_avx = brgemm | avx, diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp index 47d7d3072b7337..f3f5b1f2e07975 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp @@ -5,6 +5,7 @@ #include "shared_test_classes/single_op/group_convolution.hpp" #include "common_test_utils/node_builders/group_convolution.hpp" +#include "openvino/runtime/system_conf.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "utils/convolution_params.hpp" #include "utils/cpu_test_utils.hpp" @@ -176,14 +177,15 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface()) { - selectedType += "_bf16"; - rel_threshold = 1e-2f; - } else { - selectedType = makeSelectedTypeStr(selectedType, netType); + const auto& it = configuration.find(ov::hint::inference_precision.name()); + if (it != configuration.end()) { + if (ov::element::bf16 == it->second.as()) { + rel_threshold = 1e-2f; + } else if (ov::element::f16 == it->second.as()) { + rel_threshold = 0.00125f; + } } + selectedType = makeSelectedTypeStr(selectedType, deduce_expected_precision(netType, configuration)); // according to range propagation feature, resolution of generated inputs data for parameters moved from 32 to 32768 // 'real' part of input data was changed and some fails became visible for cases with Elu and FakeQuantize, so let's setup abs_threshold @@ -289,6 +291,7 @@ std::vector filterCPUInfoForDeviceSupportBF16(std::vector fusingParamsSetBF16{emptyFusingSpec, 
// sum fusingSum}; +const std::vector fusingParamsSet_Brdgmm{emptyFusingSpec, + // eltwise + fusingRelu, + fusingPRelu1D, + // depthwise + fusingReluScaleShift, + // fake quantize + fusingFakeQuantizePerTensorRelu, + fusingFakeQuantizePerChannelRelu + // sum + // comment out sum due to MFDNN-12841 + //fusingSumEluFQ, + //fusingSum + }; + +const std::vector fusingParamsSetBF16_Brdgmm{emptyFusingSpec, + // eltwise + fusingRelu, + // depthwise + fusingReluScaleShift + // sum + // comment out sum due to MFDNN-12841 + //fusingSum + }; + +const std::vector fusingParamsSetFP16_Brdgmm = fusingParamsSetBF16_Brdgmm; + /* ============= GroupConvolution params (planar layout) ============= */ const std::vector numOutChannels_Gemm = {6}; const std::vector numGroups_Gemm = {2, 3}; @@ -1299,6 +1329,38 @@ INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_FP32, ::testing::Values(empty_plugin_config)), GroupConvolutionLayerCPUTest::getTestCaseName); +const std::vector> dilations2d_Brdgmm = {{1, 1}}; +const auto groupConvParams_ExplicitPadding_DW_2D_Brdgmm = ::testing::Combine(::testing::ValuesIn(kernels2d), + ::testing::ValuesIn(strides2d), + ::testing::ValuesIn(padBegins2d), + ::testing::ValuesIn(padEnds2d), + ::testing::ValuesIn(dilations2d_Brdgmm), + ::testing::ValuesIn(numOutChannels_DW), + ::testing::ValuesIn(numGroups_DW), + ::testing::Values(ov::op::PadType::EXPLICIT)); +const auto BrdgmmCPUSpec = []()-> std::vector { + std::string isaStr; + if (ov::with_cpu_x86_avx512f()) { + isaStr = "avx512"; + } else { + isaStr = "avx2"; + } + return {CPUSpecificParams{{}, {}, {}, "brgconv_" + isaStr + "_dw"}}; +}; + +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_FP32_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes2dDW), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSet_Brdgmm), + ::testing::Values(empty_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_BF16, GroupConvolutionLayerCPUTest, ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D, @@ -1313,6 +1375,32 @@ INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_BF16, ::testing::Values(cpu_bf16_plugin_config)), GroupConvolutionLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_BF16_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes2dDW), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDeviceSupportBF16(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSetBF16_Brdgmm), + ::testing::Values(cpu_bf16_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_FP16_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes2dDW), + 
::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSetFP16_Brdgmm), + ::testing::Values(cpu_f16_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + /* ============= GroupConvolution (DW 3D) ============= */ const auto groupConvParams_ExplicitPadding_DW_3D = ::testing::Combine(::testing::ValuesIn(kernels3d), ::testing::ValuesIn(strides3d), @@ -1349,6 +1437,30 @@ INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_3D_DW_FP32, ::testing::ValuesIn(fusingParamsSet), ::testing::Values(empty_plugin_config)), GroupConvolutionLayerCPUTest::getTestCaseName); + +const std::vector> dilations3d_Brdgmm = {{1, 1, 1}}; +const auto groupConvParams_ExplicitPadding_DW_3D_Brdgmm = ::testing::Combine(::testing::ValuesIn(kernels3d), + ::testing::ValuesIn(strides3d), + ::testing::ValuesIn(padBegins3d), + ::testing::ValuesIn(padEnds3d), + ::testing::ValuesIn(dilations3d_Brdgmm), + ::testing::ValuesIn(numOutChannels_DW), + ::testing::ValuesIn(numGroups_DW), + ::testing::Values(ov::op::PadType::EXPLICIT)); + +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_3D_DW_FP32_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_3D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes3dDW), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSet_Brdgmm), + ::testing::Values(empty_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + /* ========= */ /* ============= SINGLE TEST CASES ============= */ diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index b675a7c2da7d42..089a03b4d6bba7 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -40,10 +40,12 @@ std::vector disabledTestPatterns() { R"(.*BinaryConvolutionLayerTest.*)", // TODO: 53618. BF16 gemm ncsp convolution crash R"(.*_GroupConv.*_inFmts=nc.*_primitive=jit_gemm.*ENFORCE_BF16=YES.*)", - // TODO: 53578. fork DW bf16 convolution does not support 3d cases yet - R"(.*_DW_GroupConv.*_inFmts=(ndhwc|nCdhw16c).*ENFORCE_BF16=YES.*)", - // TODO: 56143. 
Enable nspc convolutions for bf16 precision - R"(.*ConvolutionLayerCPUTest.*_inFmts=(ndhwc|nhwc).*INFERENCE_PRECISION_HINT=bf16.*)", + // TODO: 157596 convolution bf16 leftover test case + R"(smoke_JIT_AVX512_DW_GroupConv/GroupConvolutionLayerCPUTest.*ndhwc.*jit_avx512_dw.*INFERENCE_PRECISION_HINT=bf16.*)", + R"(smoke_Conv_1D_1x1_BF16/ConvolutionLayerCPUTest\.CompareWithRefs/IS=\[\]_TS=\(\((1|2)\.6(4|7)\.7\)_\)_K\(1\)_S\(1\)_PB\(0\)_PE\(0\)_D=\(1\)_O=63_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=jit_avx512_1x1_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", + R"(smoke_Conv_1D_1x1_BF16/ConvolutionLayerCPUTest\.CompareWithRefs/IS=\[1\.\.200\.64\.\?\]_TS=\(\(2\.64\.7\)_\(1\.64\.5\)_\)_K\(1\)_S\(1\)_PB\(0\)_PE\(0\)_D=\(1\)_O=63_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=jit_avx512_1x1_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", + R"(smoke_Conv_1D_1x1_BF16/ConvolutionLayerCPUTest\.CompareWithRefs/IS=\[\?\.6(4|7)\.1\.\.200\]_TS=\(\(2\.6(4|7)\.7\)_\(1\.6(4|7)\.9\)_\)_K\(1\)_S\(1\)_PB\(0\)_PE\(0\)_D=\(1\)_O=63_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=jit_avx512_1x1_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", + R"(smoke_GroupConv_brgemm_2D_BF16/GroupConvolutionLayerCPUTest\.CompareWithRefs/IS=\[\]_TS=\(\(1\.64\.7\.7\)_\)_K\(3\.3\)_S\(2\.2\)_PB\((0|1)\.(0|1)\)_PE\(0\.0\)_D=\(2\.2\)_O=64_G=2_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=brgconv_avx512_amx_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", // TODO: 56827. Sporadic test failures R"(.*smoke_Conv.+_FP32.ConvolutionLayerCPUTest\.CompareWithRefs.*TS=\(\(.\.67.+\).*inFmts=n.+c.*_primitive=jit_avx2.*)", // incorrect jit_uni_planar_convolution with dilation = {1, 2, 1} and output channel 1 From de776f279c87e542c640acc8140aaf87f278c991 Mon Sep 17 00:00:00 2001 From: Andrei Kashchikhin Date: Mon, 9 Dec 2024 09:27:11 +0000 Subject: [PATCH 17/23] [CI] [GHA] Introduce additional Python (3.9-3.12) API tests on macOS (#27666) ### Details: - Based on #27304, should be reviewed after it. ### Tickets: - *152690* --- .github/workflows/job_python_api_tests.yml | 142 ++++++++++++++++++++ .github/workflows/job_python_unit_tests.yml | 54 ++------ .github/workflows/job_samples_tests.yml | 14 +- .github/workflows/linux_arm64.yml | 10 ++ .github/workflows/mac.yml | 60 ++++++++- .github/workflows/mac_arm64.yml | 57 +++++++- .github/workflows/ubuntu_22.yml | 10 ++ .github/workflows/ubuntu_24.yml | 10 ++ 8 files changed, 304 insertions(+), 53 deletions(-) create mode 100644 .github/workflows/job_python_api_tests.yml diff --git a/.github/workflows/job_python_api_tests.yml b/.github/workflows/job_python_api_tests.yml new file mode 100644 index 00000000000000..541a14e2b1b6df --- /dev/null +++ b/.github/workflows/job_python_api_tests.yml @@ -0,0 +1,142 @@ +name: Python API tests + +on: + workflow_call: + inputs: + runner: + description: 'Machine on which the tests would run' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true + +permissions: read-all + +env: + PIP_CACHE_PATH: /mount/caches/pip/linux + +jobs: + Python_Unit_Tests: + name: Python API tests + timeout-minutes: 30 + runs-on: ${{ inputs.runner }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/openvino_tests + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/openvino_wheels + steps: + - name: Download OpenVINO artifacts (tarballs and wheels) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_@(wheels|tests) + path: ${{ env.INSTALL_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "OPENVINO_REPO=$GITHUB_WORKSPACE/openvino" >> "$GITHUB_ENV" + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/openvino_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/openvino_wheels" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_TEST_DIR} + working-directory: ${{ env.INSTALL_TEST_DIR }} + + - name: Fetch setup_python and install wheels actions + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + with: + sparse-checkout: | + .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml + sparse-checkout-cone-mode: false + path: 'action_root' + + - name: Setup Python ${{ inputs.python-version }} + uses: ./action_root/.github/actions/setup_python + with: + version: ${{ inputs.python-version }} + pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} + should-setup-pip-paths: ${{ runner.os == 'Linux' }} + self-hosted-runner: ${{ runner.os == 'Linux' }} + + # + # Tests + # + - name: Install OpenVINO Python wheels + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + + - name: Install Python API tests dependencies + run: python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + + # + # Tests + # + + - name: Python API Tests + run: | + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Python API Tests -- numpy>=2.0.0 + run: | + python3 -m pip uninstall -y numpy + python3 -m pip install "numpy~=2.0.0" + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph_new_numpy.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Clone API snippets + if: runner.os != 'macOS' + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + with: + 
sparse-checkout: docs/articles_en/assets/snippets + path: ${{ env.OPENVINO_REPO }} + submodules: 'false' + + - name: Docs Python snippets + if: runner.os != 'macOS' + run: | + # torch, onnx + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/python/preprocess/torchvision/requirements.txt -r ${INSTALL_TEST_DIR}/tests/requirements_onnx + # to find 'snippets' module in docs + export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py + + - name: Upload Test Results + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + if: ${{ !cancelled() }} + with: + name: test-results-python-api-${{ inputs.python-version }} + path: | + ${{ env.INSTALL_TEST_DIR }}/TEST*.html + ${{ env.INSTALL_TEST_DIR }}/TEST*.xml + if-no-files-found: 'warn' diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 8075f3299fe063..47506c83bf0945 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -65,21 +65,22 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' run: brew install pigz - name: Extract OpenVINO packages - run: | - pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'action_root' @@ -92,11 +93,10 @@ jobs: self-hosted-runner: ${{ runner.os == 'Linux' }} - name: Install OpenVINO Python wheels - run: | - # Install the core OV wheel - python3 -m pip install ./openvino-*.whl - - working-directory: ${{ env.INSTALL_WHEELS_DIR }} + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' - name: Install Python API tests dependencies run: | @@ -121,15 +121,6 @@ jobs: # Tests # - - name: Python API Tests - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Python ONNX operators tests if: (fromJSON(inputs.affected-components).Python_API.test || fromJSON(inputs.affected-components).ONNX_FE.test) && @@ -185,35 +176,6 @@ jobs: TEST_DEVICE: CPU TEST_PRECISION: FP16 - - name: Clone API snippets - if: runner.os != 'macOS' - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - sparse-checkout: docs/articles_en/assets/snippets - path: ${{ env.OPENVINO_REPO }} - submodules: 'false' - - - name: Docs Python snippets - if: runner.os != 'macOS' - run: | - # to find 
'snippets' module in docs - export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py - - - name: Python API Tests -- numpy>=2.0.0 - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - python3 -m pip uninstall -y numpy - python3 -m pip install "numpy>=2.0.0,<2.2.0" - python3 -m pip install -r ${INSTALL_TEST_DIR}/bindings/python/requirements_test.txt - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Upload Test Results uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index e144aa0cfb95aa..6f95d316abfc3f 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -54,6 +54,7 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' @@ -65,13 +66,12 @@ jobs: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action - # Python is already installed on Ubuntu within Dockerfile - if: runner.os != 'Linux' + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'openvino' @@ -113,6 +113,12 @@ jobs: # Tests # + - name: Install OpenVINO Python wheels + uses: ./openvino/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + - name: Samples tests if: fromJSON(inputs.affected-components).samples.test run: | @@ -122,7 +128,7 @@ jobs: export SHARE=$INSTALL_TEST_DIR/smoke_tests/samples_smoke_tests_data # Install Python benchmark_app by installing openvino-*.whl - python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt $INSTALL_WHEELS_DIR/openvino-*.whl + python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt export LD_LIBRARY_PATH=${IE_APP_PATH}:$LD_LIBRARY_PATH source ${INSTALL_DIR}/setupvars.sh diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 66ce9461f05fe8..e1aaa886d631c7 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -169,6 +169,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-16-cores-arm' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.11' + if: 
fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index c587c5ad7323b3..26289e969c4e00 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -151,6 +151,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +169,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -179,6 +179,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -199,6 +241,7 @@ jobs: cmake --build ${{ env.BUILD_DIR }} --parallel $(nproc) cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR_JS }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake + # # Upload build artifacts # @@ -210,7 +253,7 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' - + - name: Upload openvino wheels uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: @@ -270,6 +313,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'mac_13' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13' + python-version: 
${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Python_Unit_Tests: name: Python unit tests needs: [ Build, Smart_CI ] diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 0708a844fe6b8b..d3fb10082adfd4 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -151,6 +151,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +169,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -180,6 +180,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -279,6 +321,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13-xlarge' + python-version: ${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index f4caec8b2458a0..4fc93d73213f78 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -300,6 
+300,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.11' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index d874e06a189232..1ad3951ecd3347 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -134,6 +134,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.12' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Pytorch_Layer_Tests: name: Pytorch Layer Tests needs: [ Docker, Build, Smart_CI ] From 67f253764c4d0a9b7ab5a8f9706d063e488d7b5b Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 9 Dec 2024 19:27:32 +0100 Subject: [PATCH 18/23] [GHA][ov-provider] Exclude custom release packages from matching (#27979) To filter out automatically picking unwanted custom release builds like https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.5/windows_vc_mt Test run: https://github.com/openvinotoolkit/openvino_tokenizers/actions/runs/12237578864/job/34133648815?pr=338 (now the regular "windows" package is picked) Signed-off-by: Alina Kladieva --- .github/actions/openvino_provider/get_s3_package.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/openvino_provider/get_s3_package.py b/.github/actions/openvino_provider/get_s3_package.py index df253a422421ec..02ea99cb2f3403 100644 --- a/.github/actions/openvino_provider/get_s3_package.py +++ b/.github/actions/openvino_provider/get_s3_package.py @@ -54,6 +54,10 @@ def main(product, version_pattern, platform, arch, folder): matching_files = filter_files_by_criteria(all_files, product, version_pattern, platform, arch, folder) if matching_files: logger.info(f"Matching packages: {sorted(matching_files)}") + if len(matching_files) > 1: + custom_release_build_pattern = fr".*/{version_pattern}/(linux_|windows_|macos_).*/.*" + # Exclude custom release builds, if any, from matches + matching_files = [file for file in matching_files if not re.search(custom_release_build_pattern, file)] package_url = f"https://storage.openvinotoolkit.org{sorted(matching_files)[-1]}" logger.info(f"Returning package URL: {package_url}") action_utils.set_github_output("package_url", package_url) From f0da7075169b97f6523d8f465cbb6ab76f995324 Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Tue, 10 Dec 2024 08:42:26 +0100 Subject: [PATCH 19/23] [tests/requirements_pytorch] Temporarily fix optimum-intel version on last stable commit (#27985) There are failures with newer commits, e.g. 
https://github.com/openvinotoolkit/openvino/actions/runs/12240792041/job/34146426674 --------- Signed-off-by: Alina Kladieva --- .github/components.yml | 1 + tests/requirements_pytorch | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/components.yml b/.github/components.yml index 8de51a2ced3343..74247e1f051cd5 100644 --- a/.github/components.yml +++ b/.github/components.yml @@ -149,6 +149,7 @@ PyTorch_FE: build: - CPU - Python_API + - TOKENIZERS # PyTorch_FE tests depend on tokenizers build JAX_FE: revalidate: diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index be304155e2afc0..f42deb81839883 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -44,7 +44,7 @@ super-image==0.1.7 huggingface-hub==0.25.2 # use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12" +git+https://github.com/huggingface/optimum-intel.git@5c735487d4bd3dd8d7dccb242d8d5988e7dd4069; python_version < "3.12" # set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer hf_transfer==0.1.8 From 9e6dfed16a29ddfcddba78f2d1b895d647cd2ec9 Mon Sep 17 00:00:00 2001 From: Pavel Durandin Date: Tue, 10 Dec 2024 11:10:12 +0400 Subject: [PATCH 20/23] [GPU] Fix tests errors, phase 7 (#27953) ### Details: - Fixes in unit tests failures --- .../tests/unit/test_cases/convolution_gpu_test.cpp | 7 ++++--- .../intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp | 2 +- .../intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 5d01d448dcfc64..f0243f055c3670 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -10784,11 +10784,12 @@ TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) { auto is_weight_1x1 = (p.wei_shape[p.wei_shape.size() - 1] == 1 && p.wei_shape[p.wei_shape.size() - 2] == 1); auto is_valid_output = p.wei_shape[0] % 16 == 0; - auto is_valid_strid = p.stride[0] == 1 && p.stride[1] == 1; - auto is_valid_padding = p.pad_begin[0] == 0 && p.pad_begin[1] == 0 && p.pad_end[0] == 0 && p.pad_end[1] == 0; + auto is_valid_strid = std::all_of(p.stride.begin(), p.stride.end(), [](size_t i) { return i == 1; }); + auto is_valid_padding = std::all_of(p.pad_begin.begin(), p.pad_begin.end(), [](int i) { return i == 0; }) + && std::all_of(p.pad_end.begin(), p.pad_end.end(), [](int i) { return i == 0; }); if (!is_weight_1x1 || !is_valid_output || !is_valid_strid || !is_valid_padding) { - std::cout << "[ SKIPPED ] The test is skipped (is_weight_1x1:" << is_weight_1x1 << ", is_valid_output" << is_valid_output + std::cout << "[ SKIPPED ] The test is skipped (is_weight_1x1: " << is_weight_1x1 << ", is_valid_output: " << is_valid_output << ", is_valid_strid: " << is_valid_strid << ", is_valid_padding: " << is_valid_padding << std::endl; ASSERT_EQ(1, 1); return; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp index 3099c8dad5d9d3..5d78cdec028724 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp @@ -1963,7 +1963,7 @@ const std::vector IRDFT_params_4d = { {{2, 10, 6, 2}, {2, 10, 10}, {1, 2}, {}, 
expected_rdft2d_results, rinput_data}, {{2, 10, 6, 2}, {2, 10, 10}, {1, 2}, {10, 10}, expected_rdft2d_results, rinput_data}, {{2, 5, 7, 2}, {2, 5, 12}, {1, 2}, {5, 12}, expected_rdft2d_results_2, expected_irdft2d_results_2}, - {{2, 10, 6, 2}, {2, 10, 10}, {0, 1, 2}, {10, 10}, expected_rdft3d_results, rinput_data}, + {{2, 10, 6, 2}, {2, 10, 10}, {0, 1, 2}, {10, 10, 10}, expected_rdft3d_results, rinput_data}, {{2, 10, 6, 2}, {4, 5, 12}, {0, 1, 2}, {4, 5, 12}, expected_rdft3d_results, expected_irdft3d_results_2}, }; const std::vector IRDFT_params_5d = extendByOneDimension(IRDFT_params_4d); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp index 461474335e903a..324f90faf0b70e 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp @@ -1244,7 +1244,7 @@ static void generic_average_wo_padding_test(format fmt, tensor output, tensor in tpl.add(reorder("reorder", input_info("in"), input_mem->get_layout().with_padding((padding) off.sizes()))); pool_in = "reorder"; } - tpl.add(pooling("pool", input_info(pool_in), pooling_mode::average_no_padding, window, stride, offset)); + tpl.add(pooling("pool", input_info(pool_in), pooling_mode::average_no_padding, window, stride, offset, offset)); auto cfg = get_test_default_config(get_test_engine()); cfg.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"pool", {format::any, "", impl_types::ocl}}})); From 6a4ba4695191b14c215e4613b5327707c0e33008 Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Tue, 10 Dec 2024 08:11:22 +0100 Subject: [PATCH 21/23] [CPU] Introduce FullyConnected, FCQuantized, FCCompressed, Placeholder (#26239) ### Details: 1. Introduce the following operations to the internal opset * `FullyConnected` (`MatMul` with transposed constant second input) * `FullyConnectedCompressed` (`FullyConnected` with weights compression) * `FullyConnectedQuantizedLegacy` (`FullyConnected` with quantized activations and weights and dequantize scale and zero point pulled through the Op by LPT) * `FullyConnectedQuantized` (`FullyConnected` with quantization scales and zero points on activation, weights and outputs). Planned to be used in scope of dynamic quantization. Can be used for a static quantization as well in the future. * Unused inputs are presented as `Constant` input with `Shape{0}` 2. The following transformations were added / updated: * `ConvertFullyConnectedToFullyConnectedCompressed` (replaces proprietary ~`FuseFCAndWeightsDecompression`~) * `ConvertFCToFCQuantizedLegacy` replaces proprietary ~`FuseConvMatmulFCDeconvAndDQScales`~ * `FullyConnectedBiasFusion` (added into CPU folder for now, needs to be checked and review by GPU team before adaptation to internal opset). 
Replaces proprietary ~`FuseConvolutionMatMulDeconvAndBias`~ * `ConvertMatMulToFC` updated to use `ov::op::internal:FullyConnected`, planned to be moved to internal opset after review from GPU team ### Todo - [x] Clean up debug code - [x] Clean up extra cmake targets - [x] Perf regression check ### Tickets: - 149923 --- .../include/ov_ops/fully_connected.hpp | 46 +++ .../ov_ops/fully_connected_compressed.hpp | 41 +++ .../ov_ops/fully_connected_quantized.hpp | 39 +++ .../fully_connected_quantized_legacy.hpp | 41 +++ .../convert_fc_to_compressed.hpp | 29 ++ .../convert_fc_to_quantized_legacy.hpp | 22 ++ .../src/ov_ops/fully_connected.cpp | 62 ++++ .../src/ov_ops/fully_connected_compressed.cpp | 63 ++++ .../src/ov_ops/fully_connected_quantized.cpp | 59 ++++ .../fully_connected_quantized_legacy.cpp | 71 +++++ .../convert_fc_to_compressed.cpp | 181 +++++++++++ .../convert_fc_to_quantized_legacy.cpp | 77 +++++ src/frontends/ir/src/ir_deserializer.cpp | 5 +- src/plugins/intel_cpu/src/cpu_types.cpp | 7 + src/plugins/intel_cpu/src/cpu_types.h | 6 + .../intel_cpu/src/dnnl_postops_composer.cpp | 105 +++++-- .../intel_cpu/src/dnnl_postops_composer.h | 3 +- src/plugins/intel_cpu/src/edge.cpp | 8 + src/plugins/intel_cpu/src/extension.cpp | 10 +- src/plugins/intel_cpu/src/graph_optimizer.cpp | 262 +--------------- src/plugins/intel_cpu/src/graph_optimizer.h | 1 - .../src/memory_desc/empty_memory_desc.h | 4 +- src/plugins/intel_cpu/src/node.cpp | 3 +- .../executors/acl/acl_fullyconnected.cpp | 24 +- .../dnnl/dnnl_convolution_primitive.cpp | 3 +- .../dnnl/dnnl_fullyconnected_primitive.cpp | 60 ++-- .../dnnl/dnnl_fullyconnected_primitive.hpp | 7 - .../executors/dnnl/dnnl_matmul_primitive.cpp | 10 +- .../src/nodes/executors/executor_config.hpp | 1 - .../src/nodes/executors/executor_factory.hpp | 1 - .../nodes/executors/fullyconnected_config.hpp | 7 +- .../fullyconnected_implementations.cpp | 3 +- .../src/nodes/executors/matmul_config.hpp | 1 - .../src/nodes/executors/memory_arguments.hpp | 8 +- .../src/nodes/executors/mlas/mlas_gemm.cpp | 43 +-- .../intel_cpu/src/nodes/fullyconnected.cpp | 237 +++++++++------ .../intel_cpu/src/nodes/fullyconnected.h | 34 ++- src/plugins/intel_cpu/src/nodes/input.cpp | 50 ++-- src/plugins/intel_cpu/src/nodes/input.h | 2 +- src/plugins/intel_cpu/src/nodes/reference.cpp | 2 +- .../shape_inference/custom/fullyconnected.cpp | 4 +- .../cpu_opset/common/op/fully_connected.cpp | 79 ----- .../cpu_opset/common/op/fully_connected.hpp | 39 --- .../common/pass/convert_matmul_to_fc.cpp | 32 +- .../common/pass/convert_matmul_to_fc.hpp | 2 +- .../common/pass/convert_to_power_static.cpp | 22 +- .../cpu_opset/common/pass/fc_bias_fusion.cpp | 79 +++++ .../cpu_opset/common/pass/fc_bias_fusion.hpp | 19 ++ .../pass/move_fc_reshape_to_weights.cpp | 5 +- .../cpu_opset/common/pass/split_fc.cpp | 207 ------------- .../cpu_opset/common/pass/split_fc.hpp | 81 ----- .../convert_to_cpu_specific_opset.hpp | 43 ++- .../transformation_pipeline.cpp | 3 +- .../intel_cpu/src/transformations/utils.cpp | 4 +- src/plugins/intel_cpu/src/utils/cpu_utils.hpp | 31 ++ .../src/utils/debug_capabilities.cpp | 5 +- .../intel_cpu/src/utils/debug_capabilities.h | 7 + .../instances/arm/matmul.cpp | 3 - .../src/x64/matmul_weights_decompression.cpp | 2 +- .../custom_shape_infer/fullconnect.cpp | 70 ++++- .../transformations/convert_matmul_test.cpp | 231 +++++++++------ .../move_fc_reshape_to_weights.cpp | 9 +- .../unit/transformations/split_fc_test.cpp | 280 ------------------ .../common_test_utils/src/ov_test_utils.cpp | 1 + 
64 files changed, 1562 insertions(+), 1334 deletions(-) create mode 100644 src/common/transformations/include/ov_ops/fully_connected.hpp create mode 100644 src/common/transformations/include/ov_ops/fully_connected_compressed.hpp create mode 100644 src/common/transformations/include/ov_ops/fully_connected_quantized.hpp create mode 100644 src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected.cpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected_compressed.cpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected_quantized.cpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp delete mode 100644 src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp diff --git a/src/common/transformations/include/ov_ops/fully_connected.hpp b/src/common/transformations/include/ov_ops/fully_connected.hpp new file mode 100644 index 00000000000000..6f33b5963ffaf8 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnected : public ov::op::Op { +public: + OPENVINO_OP("FullyConnected", "ie_internal_opset"); + + FullyConnected() = default; + + FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::element::Type output_type = ov::element::undefined); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + ov::element::Type get_output_type() const { + return m_output_type; + } + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; + +protected: + ov::element::Type m_output_type; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp b/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp new file mode 100644 index 00000000000000..d363a339406070 --- /dev/null 
+++ b/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedCompressed : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedCompressed", "ie_internal_opset", FullyConnected); + + FullyConnectedCompressed() = default; + + FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::element::Type output_type = ov::element::undefined); + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp new file mode 100644 index 00000000000000..6eceed0abdef78 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedQuantized : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedQuantized", "ie_internal_opset", FullyConnected); + + FullyConnectedQuantized() = default; + + FullyConnectedQuantized(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::Output& input_scales, + const ov::Output& input_zero_points, + const ov::Output& output_scales, + const ov::Output& output_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp new file mode 100644 index 00000000000000..2c68ec4dc365f9 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedQuantizedLegacy : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedQuantizedLegacy", "ie_internal_opset", FullyConnected); + + FullyConnectedQuantizedLegacy() = default; + + FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const 
ov::Output& bias, + const ov::Output& deq_scales, + const ov::Output& deq_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::element::Type output_type = ov::element::undefined); + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp new file mode 100644 index 00000000000000..1b6fcfb2bb3684 --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "ov_ops/fully_connected.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertFullyConnectedToFullyConnectedCompressed; + +} // namespace pass +} // namespace ov + +class ov::pass::ConvertFullyConnectedToFullyConnectedCompressed : public ov::pass::MatcherPass { +public: + using SupportsPredicate = + std::function&, size_t, size_t, size_t)>; + + OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0"); + ConvertFullyConnectedToFullyConnectedCompressed(const std::vector& supported_activation_types, + const std::vector& supported_weights_types, + SupportsPredicate supports_config = nullptr, + bool convert_u4zp_to_u8 = false); +}; diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp new file mode 100644 index 00000000000000..88990f92cb573c --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertFCToFCQuantizedLegacy; + +} // namespace pass +} // namespace ov + +class ov::pass::ConvertFCToFCQuantizedLegacy : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedQuantized", "0"); + ConvertFCToFCQuantizedLegacy(); +}; diff --git a/src/common/transformations/src/ov_ops/fully_connected.cpp b/src/common/transformations/src/ov_ops/fully_connected.cpp new file mode 100644 index 00000000000000..3fa609362b999c --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected.hpp" + +#include + +#include "matmul_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnected::FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::element::Type output_type) + : Op({A, B, bias}), + m_output_type(output_type) { + validate_and_infer_types(); +} + 
+FullyConnected::FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::element::Type output_type) + : FullyConnected(A, B, std::make_shared(element::undefined, Shape{0}), output_type) {} + +bool FullyConnected::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("output_type", m_output_type); + return true; +} + +std::shared_ptr FullyConnected::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_output_type); +} + +void FullyConnected::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, + input_size >= 3, + "Number of inputs is incorrect. Current value is: ", + input_size, + ", expected at least 3."); + + ov::op::v0::MatMul op; + op.set_transpose_a(false); + op.set_transpose_b(true); + + auto out_shapes = + ov::op::v0::shape_infer(&op, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}); + + auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp b/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp new file mode 100644 index 00000000000000..e0bb13042ea6ff --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_compressed.hpp" + +#include + +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, weight_scales); + set_argument(4, weight_zero_points); + validate_and_infer_types(); +} + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::element::Type output_type) + : FullyConnectedCompressed(X, + W, + bias, + weight_scales, + std::make_shared(element::undefined, Shape{0}), + output_type) {} + +std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedCompressed::validate_and_infer_types() { + const auto input_size = get_input_size(); + + NODE_VALIDATION_CHECK(this, input_size == 5, "Number of inputs is incorrect. 
Current value is: ", input_size); + + FullyConnected::validate_and_infer_types(); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp new file mode 100644 index 00000000000000..3f06e14834f7d1 --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_quantized.hpp" + +#include "openvino/core/type/element_type.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::Output& input_scales, + const ov::Output& input_zero_points, + const ov::Output& output_scales, + const ov::Output& output_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, weight_scales); + set_argument(4, weight_zero_points); + set_argument(5, input_scales); + set_argument(6, input_zero_points); + set_argument(7, output_scales); + set_argument(8, output_zero_points); + validate_and_infer_types(); +} + +std::shared_ptr FullyConnectedQuantized::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(5), + new_args.at(6), + new_args.at(7), + new_args.at(8), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedQuantized::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, input_size == 9, "Number of inputs is incorrect. 
Current value is: ", input_size); + + FullyConnected::validate_and_infer_types(); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp new file mode 100644 index 00000000000000..42df0980086199 --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_quantized_legacy.hpp" + +#include + +#include "matmul_shape_inference.hpp" +#include "openvino/core/type/element_type.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedQuantizedLegacy::FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::Output& deq_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, deq_scales); + set_argument(4, deq_zero_points); + validate_and_infer_types(); +} + +FullyConnectedQuantizedLegacy::FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::element::Type output_type) + : FullyConnectedQuantizedLegacy(X, + W, + bias, + deq_scales, + std::make_shared(element::undefined, Shape{0}), + output_type) {} + +std::shared_ptr FullyConnectedQuantizedLegacy::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedQuantizedLegacy::validate_and_infer_types() { + const auto input_size = get_input_size(); + + NODE_VALIDATION_CHECK(this, input_size == 5, "Number of inputs is incorrect. Current value is: ", input_size); + + ov::op::v0::MatMul op; + op.set_transpose_a(false); + op.set_transpose_b(true); + + auto out_shapes = + ov::op::v0::shape_infer(&op, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}); + + auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp new file mode 100644 index 00000000000000..87c3b669d98c6d --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp @@ -0,0 +1,181 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_fc_to_compressed.hpp" + +#include +#include + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed( + const std::vector& supported_activation_types, + const std::vector& supported_weights_types, + SupportsPredicate supports_config, + bool convert_u4zp_to_u8) { + using namespace ov::pass::pattern; + + auto reshape_3d_to_2d = [](const ov::Output& output) { + auto in_ps = output.get_node()->get_input_partial_shape(0); + auto out_ps = output.get_node()->get_output_partial_shape(0); + return in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2; + }; + + auto activation_m = any_input(ov::pass::pattern::type_matches_any(supported_activation_types)); + auto weights_m = wrap_type(ov::pass::pattern::type_matches_any(supported_weights_types)); + auto convert_m = wrap_type({weights_m}); + + auto sub_const_m = wrap_type(); + auto sub_convert_const_m = wrap_type({sub_const_m}); + auto sub_with_convert_m = wrap_type({convert_m, sub_convert_const_m}); + auto sub_no_convert_m = wrap_type({convert_m, sub_const_m}); + auto subtract_m = std::make_shared(OutputVector{sub_with_convert_m, sub_no_convert_m}); + + auto mul_const_m = wrap_type(); + auto mul_convert_const_m = wrap_type({mul_const_m}); + auto mul_scale_m = std::make_shared(OutputVector{mul_const_m, mul_convert_const_m}); + + auto mul_with_sub_m = wrap_type({subtract_m, mul_scale_m}); + auto mul_no_sub_m = wrap_type({convert_m, mul_scale_m}); + auto mul_m = std::make_shared(OutputVector{mul_with_sub_m, mul_no_sub_m}); + + auto reshape_const_m = wrap_type(); + auto reshape_m = wrap_type({mul_m, reshape_const_m}, reshape_3d_to_2d); + + auto transpose_input = std::make_shared(OutputVector{reshape_m, mul_m}); + auto transpose_const_m = wrap_type(); + auto transpose_m = wrap_type({transpose_input, transpose_const_m}); + + auto bias_m = any_input(); + auto weights_input_m = std::make_shared(ov::OutputVector{reshape_m, transpose_m, mul_m}); + auto fully_connected_m = wrap_type({activation_m, weights_input_m, bias_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + 
OPENVINO_ASSERT(pattern_map.count(fully_connected_m)); + OPENVINO_ASSERT(pattern_map.count(mul_const_m)); + OPENVINO_ASSERT(pattern_map.count(weights_m)); + OPENVINO_ASSERT(pattern_map.count(bias_m)); + OPENVINO_ASSERT(pattern_map.count(convert_m)); + auto fc = std::dynamic_pointer_cast( + pattern_map.at(fully_connected_m).get_node_shared_ptr()); + if (!fc || transformation_callback(fc)) { + return false; + } + + bool has_transpose = pattern_map.count(transpose_m); + auto scale_shape = pattern_map.at(mul_const_m).get_shape(); + bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { + return d > 1; + }) > 1; + + auto weights_shape = fc->get_input_shape(1); + const size_t IC = *(weights_shape.rbegin()); + const size_t OC = *(weights_shape.rbegin() + 1); + + const size_t G = grouped ? (has_transpose ? *(scale_shape.rbegin() + 2) : *(scale_shape.rbegin() + 1)) : 1; + + if (supports_config && !supports_config(fc, IC, OC, G)) + return false; + + auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr node) { + auto constant = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(constant != nullptr); + ov::Shape current_shape = constant->get_shape(); + if (current_shape.size() <= 2) + return constant; + + OPENVINO_ASSERT(current_shape.size() == 3); + + auto new_shape = (has_transpose || !grouped) + ? ov::Shape{current_shape[0] * current_shape[1], current_shape[2]} + : ov::Shape{current_shape[0], current_shape[1] * current_shape[2]}; + + return std::make_shared(*constant, new_shape); + }; + + auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr node) -> std::shared_ptr { + auto constant = std::dynamic_pointer_cast(node); + if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8) + return std::dynamic_pointer_cast(constant); + return std::make_shared(node, ov::element::u8); + }; + + const ov::Output& fc_input_a = fc->input_value(0); + const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr()); + std::shared_ptr optional_zero_point = nullptr; + + const bool with_zero_point = + pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0; + if (with_zero_point) { + // WA: Convert ZP to u8 for OneDNN case to avoid u4 reorder + optional_zero_point = + convert_u4const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr())); + } + + std::shared_ptr fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr()); + std::shared_ptr fc_input_scale = scale; + std::shared_ptr fc_input_zp = optional_zero_point; + std::shared_ptr fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr(); + std::vector> result_nodes = {}; + if (has_transpose) { + const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr(); + std::shared_ptr transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr(); + if (ov::shape_size(transpose_const->get_shape()) != fc_input_b->get_output_partial_shape(0).size()) { + std::vector new_order(fc_input_b->get_output_partial_shape(0).size()); + std::iota(new_order.begin(), new_order.end(), 0); + std::swap(new_order[new_order.size() - 1], new_order[new_order.size() - 2]); + transpose_const = + std::make_shared(ov::element::i32, ov::Shape{new_order.size()}, new_order); + } + + fc_input_b = transpose->clone_with_new_inputs({fc_input_b->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_b); + result_nodes.push_back(fc_input_b); + fc_input_scale = 
transpose->clone_with_new_inputs({scale->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_scale); + result_nodes.push_back(fc_input_scale); + if (with_zero_point && ov::shape_size(optional_zero_point->output(0).get_shape()) > 1) { + fc_input_zp = transpose->clone_with_new_inputs({optional_zero_point->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_zp); + result_nodes.push_back(fc_input_zp); + } + } + + fc_input_zp = + with_zero_point ? fc_input_zp : std::make_shared(element::undefined, Shape{0}); + ov::disable_constant_folding(fc_input_zp); + result_nodes.push_back(fc_input_zp); + + auto new_fc = std::make_shared(fc_input_a, + fc_input_b, + fc_input_bias, + fc_input_scale, + fc_input_zp, + fc->get_output_type()); + + result_nodes.push_back(new_fc); + new_fc->set_friendly_name(fc->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), result_nodes); + ov::replace_node(fc, new_fc); + return true; + }; + + auto m = std::make_shared(fully_connected_m, + "ConvertFullyConnectedToFullyConnectedCompressed"); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp new file mode 100644 index 00000000000000..908e36a51a7eb9 --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_fc_to_quantized_legacy.hpp" + +#include + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertFCToFCQuantizedLegacy::ConvertFCToFCQuantizedLegacy() { + using namespace ov::pass::pattern; + + std::vector activation_types{ov::element::u8, ov::element::i8}; + std::vector weights_types{ov::element::i8}; + + auto activations_m = pattern::any_input(ov::pass::pattern::type_matches_any(activation_types)); + auto weights_m = wrap_type(ov::pass::pattern::type_matches_any(weights_types)); + auto bias_m = pattern::any_input(); + + auto fully_connected_m = wrap_type({activations_m, weights_m, bias_m}); + auto dequantization_scales_m = wrap_type(); + auto multiply_m = wrap_type({fully_connected_m, dequantization_scales_m}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto fc_output = pattern_map.at(fully_connected_m); + auto activations = pattern_map.at(activations_m); + auto weights = pattern_map.at(weights_m); + auto bias = pattern_map.at(bias_m); + auto multiply = pattern_map.at(multiply_m); + auto dequantization_scales = pattern_map.at(dequantization_scales_m); + const auto& fc_output_shape = fc_output.get_partial_shape(); + const auto& multiply_output_shape = multiply.get_partial_shape(); + + if (*fc_output_shape.rbegin() != *multiply_output_shape.rbegin()) { + return false; + } + + auto fc_node = std::dynamic_pointer_cast( + 
pattern_map.at(fully_connected_m).get_node_shared_ptr()); + + ov::NodeVector new_ops; + auto zp = std::make_shared(element::undefined, Shape{0}); + new_ops.push_back(zp); + + auto fc_quantized = + std::make_shared(activations, + weights, + bias, + dequantization_scales, + zp, + fc_node->get_output_type()); + new_ops.push_back(fc_quantized); + + const auto& multiply_node = multiply.get_node_shared_ptr(); + fc_quantized->set_friendly_name(multiply_node->get_friendly_name()); + + ov::copy_runtime_info({multiply_node, fc_node}, new_ops); + ov::replace_node(multiply_node, fc_quantized); + + return true; + }; + + auto m = std::make_shared(multiply_m, "ConvertFullyConnectedToFullyConnectedQuantized"); + this->register_matcher(m, callback); +} diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp index 7c8b6e9d4b97ab..2d1dfba956ea72 100644 --- a/src/frontends/ir/src/ir_deserializer.cpp +++ b/src/frontends/ir/src/ir_deserializer.cpp @@ -10,6 +10,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/meta_data.hpp" #include "openvino/core/rt_info/weightless_caching_attributes.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/loop.hpp" @@ -831,7 +832,9 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(inputs[i].get_node_shared_ptr()) && + ov::element::Type_t::undefined == inputs[i].get_element_type()) OPENVINO_THROW(params.type, " layer ", params.name, diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 3b6440e56c3272..30884bbe649962 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -41,6 +41,9 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"GroupConvolution", Type::Convolution}, {"MatMul", Type::MatMul}, {"FullyConnected", Type::FullyConnected}, + {"FullyConnectedCompressed", Type::FullyConnected}, + {"FullyConnectedQuantizedLegacy", Type::FullyConnected}, + {"FullyConnectedQuantized", Type::FullyConnected}, {"MaxPool", Type::Pooling}, {"AvgPool", Type::Pooling}, {"AdaptiveMaxPool", Type::AdaptivePooling}, @@ -469,6 +472,10 @@ std::string algToString(const Algorithm alg) { CASE(FQCommon); CASE(FQQuantization); CASE(FQBinarization); + CASE(FullyConnectedCommon); + CASE(FullyConnectedCompressed); + CASE(FullyConnectedQuantized); + CASE(FullyConnectedQuantizedLegacy); CASE(ROIPoolingMax); CASE(ROIPoolingBilinear); CASE(ROIAlignMax); diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 9461526184b0bf..71088c22af8336 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -213,6 +213,12 @@ enum class Algorithm { EltwiseBitwiseLeftShift, EltwiseBitwiseRightShift, + // FullyConnected algorithms + FullyConnectedCommon, + FullyConnectedCompressed, + FullyConnectedQuantized, + FullyConnectedQuantizedLegacy, + // FakeQuantize algorithms FQCommon, FQQuantization, diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp index 2f82fbe553ae19..70d28f1f4ac739 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp @@ -11,21 +11,69 @@ #include #include +#include "cpu_types.h" #include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/executors/memory_arguments.hpp" #include "openvino/core/type/element_type.hpp" +#include 
"utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { +static std::vector getDeQuantizedScales(const MemoryArgs& memory) { + if (!memory.count(ARG_DST_DEQ_SCALE)) + return {}; + + auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE); + + auto scalesData = static_cast(scalesMemory->getData()); + + if (!scalesData) + return {}; + + auto dstShape = memory.at(ARG_DST)->getShape(); + auto dqScalesShape = scalesMemory->getShape(); + + auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size()); + + auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies()); + + std::vector DQScales(scaleSize, 1.0); + + OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize, + "set invalid scales size , DQScales vector size: ", + DQScales.size(), + ", scale data size: ", + scaleSize); + + // @todo do we really need to broadcast dq scales and then resize them back? + if (scaleSize > DQScales.size()) + DQScales.resize(scaleSize, DQScales[0]); + if (1 == scaleSize) { + std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) { + return (scalesData[0] * val); + }); + } else { + for (size_t i = 0; i < DQScales.size(); i++) { + DQScales[i] *= scalesData[i]; + } + } + if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) { + return (val == DQScales[0]); + })) + DQScales.resize(1); + + return DQScales; +} + DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, const dnnl::engine& engine, const VectorDims& outputDims, const size_t indexOfOutputChannelDim, const bool isInt8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType) : engine(engine), postOps(postOps), @@ -39,6 +87,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, dimsPerOC = dimsPerTensor = VectorDims(outputDims.size(), 1); dimsPerOC[idxOC] = OC; + const auto& DQScales = getDeQuantizedScales(memory); // generalise dq scales, so extra logic is necessary here. if (isINT8) { wei_scale_values = DQScales.empty() ? std::vector{1.0} : DQScales; @@ -49,6 +98,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, updateWeiScales(); // If having the bias, attr weight scale can't be updated for further ops-ops optimization. // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias. + const bool hasBias = !memory.at(ARG_BIAS)->getDesc().empty(); weightScaleAvailable = !hasBias; } else if (!DQScales.empty()) { // DQ scale is fused but swiching back to non-INT8 for execution in some cases. 
@@ -325,9 +375,9 @@ static OptimizedFormula updateOptimizedFormula(const FakeQuantizePostOp& postOp, } bool DnnlPostOpsComposer::appendAttrPostOps(const FakeQuantizePostOp& postOp, - bool isLastPostOp, - bool doRounding, - bool allowBinary) { + bool isLastPostOp, + bool doRounding, + bool allowBinary) { DEBUG_LOG("isLastPostOp=", isLastPostOp, ", outDataType=", @@ -541,9 +591,9 @@ bool DnnlPostOpsComposer::appendShift(const std::vector& shift, bool allo } bool DnnlPostOpsComposer::appendLinear(const std::vector& scale, - const std::vector& shift, - bool isLastPostOp, - bool allowBinary) { + const std::vector& shift, + bool isLastPostOp, + bool allowBinary) { if (scale.size() == 1 && shift.size() == 1) { if (shift[0] == 0.0f) return appendScale(scale, isLastPostOp, allowBinary); @@ -599,15 +649,27 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, if (shape.size() == 1 && shape[0] == 1) { shape.push_back(1); } + if (shape.size() != 2 && shape.size() != 3) - OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); + OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); - Shape dstShape = needTranspose ? Shape({shape[0], shape[1]}) : Shape({shape[shape.size() - 1], shape[0]}); - DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(dstPrc), dnnl::memory::format_tag::io); - auto dstMem = std::make_shared(engine, dstMemoryDesc); + // weights without batch: (OC, G) + // weights with batch: (B, OC, G) + const size_t OC = shape[shape.size() - 2]; + const size_t G = shape[shape.size() - 1]; + + Shape dstShape = Shape({OC, G}); + DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, + DnnlExtensionUtils::ElementTypeToDataType(dstPrc), + dnnl::memory::format_tag::io); + auto dstMem = std::make_shared(engine, dstMemoryDesc); auto srcFormat = needTranspose ? 
dnnl::memory::format_tag::oi : dnnl::memory::format_tag::io; - DnnlBlockedMemoryDesc srcMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), srcFormat); + + DnnlBlockedMemoryDesc srcMemoryDesc( + dstShape, + DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), + srcFormat); auto srcMem = std::make_shared(engine, srcMemoryDesc, paramsPtr->getData()); dstMem->load(*srcMem); @@ -615,25 +677,32 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, return dstMem; } -void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (scales_ptr == nullptr) return; auto scalesMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine); attr.set_scales_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scalesMem); dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); } -void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (zero_points_ptr == nullptr) return; - auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); + auto zeroPointsMem = + prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); attr.set_zero_points_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem; dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive(); } diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h index c07ec0f608b6db..8c2718aaaed4d5 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h @@ -27,8 +27,7 @@ class DnnlPostOpsComposer { const size_t indexOfOutputChannelDim, const bool isINT8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType); DnnlPrimitiveAttrs compose(); void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision); diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index 82bde8edae2b4a..c49b924477f694 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -5,6 +5,7 @@ #include "edge.h" #include "node.h" #include "dnnl_extension_utils.h" +#include "openvino/core/type/element_type.hpp" #include "openvino/util/pp.hpp" using namespace 
dnnl; @@ -212,6 +213,10 @@ Edge::ReorderStatus Edge::needReorder() { bool optimized = false; auto inputPortDesc = getInputPortDesc(); auto outPortDesc = getOutputPortDesc(); + + if (inputPortDesc->getMemDesc()->getPrecision() == element::undefined) + return ReorderStatus::No; + // Check whether the child node may accept the parent produced tensor if (!outPortDesc->isCompatible(*inputPortDesc)) { // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be read using different descriptors @@ -410,6 +415,9 @@ const MemoryDesc& Edge::getOutputDesc() const { } const MemoryDesc& Edge::getDesc() const { + if (getInputDesc().getPrecision() == element::undefined) + return getInputDesc(); + if (!getInputDesc().isCompatible(getOutputDesc())) OPENVINO_THROW("Cannot get descriptor for edge: ", getParent()->getName(), "->", getChild()->getName()); diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index a29282d4af3101..e6dbc04b0ca6a4 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -7,6 +7,10 @@ #include "openvino/core/op_extension.hpp" #include "ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "ov_ops/fully_connected_quantized.hpp" #include "ov_ops/gather_compressed.hpp" #include "ov_ops/multiclass_nms_ie_internal.hpp" #include "ov_ops/nms_ie_internal.hpp" @@ -16,7 +20,6 @@ #include "ov_ops/type_relaxed.hpp" #include "snippets/op/subgraph.hpp" #include "transformations/cpu_opset/common/op/causal_mask_preprocess.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" @@ -70,7 +73,6 @@ class TypeRelaxedExtension : public ov::OpExtension> { #endif #define CPU_EXTENSIONS \ - OP_EXTENSION(ov::intel_cpu::FullyConnectedNode) \ OP_EXTENSION(ov::intel_cpu::LeakyReluNode) \ OP_EXTENSION(ov::intel_cpu::PowerStaticNode) \ OP_EXTENSION(ov::intel_cpu::CausalMaskPreprocessNode) \ @@ -85,6 +87,10 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::op::internal::NmsStaticShapeIE) \ OP_EXTENSION(ov::op::internal::RMS) \ OP_EXTENSION(ov::op::internal::RoPE) \ + OP_EXTENSION(ov::op::internal::FullyConnected) \ + OP_EXTENSION(ov::op::internal::FullyConnectedCompressed) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantizedLegacy) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantized) \ OP_EXTENSION_X64(ov::intel_cpu::MHANode) \ OP_EXTENSION_X64(ov::intel_cpu::InteractionNode) \ OP_EXTENSION_X64(ov::intel_cpu::LLMMLPNode) \ diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 61590b8691f4b2..94f54fc4c59b55 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -67,10 +67,6 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { FuseConvMatmulFCDeconvAndDQScales(graph); graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression"); - FuseFCAndWeightsDecompression(graph); - graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias"); 
FuseConvolutionMatMulDeconvAndBias(graph); graph.RemoveDroppedNodes(); @@ -217,8 +213,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { auto scaleNode = node->getParentEdgeAt(1)->getParent(); if (!(parentNode->getType() == Type::Convolution || parentNode->getType() == Type::MatMul - || parentNode->getType() == Type::Deconvolution - || parentNode->getType() == Type::FullyConnected)) + || parentNode->getType() == Type::Deconvolution)) return false; if (!scaleNode->isConstant()) return false; @@ -292,257 +287,6 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { } } -void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) { - std::set supportedWeightsPrecisions{ - ov::element::u8, ov::element::i8, ov::element::nf4, ov::element::u4, ov::element::i4, ov::element::f4e2m1}; - const std::set supportedDataPrecisions{ov::element::f32, ov::element::bf16}; - auto expectedNode = [](NodePtr node, Type expectedType) { - return node->getType() == expectedType && node->getChildEdges().size() == 1; - }; - -#define SKIP_FUSION_FOR_NODE(node) \ - DEBUG_LOG("FuseFCAndWeightsDecompression can't be applied for node ", node->getName()); \ - continue - - if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2)) - return; - - auto& graphNodes = graph.GetNodes(); - for (size_t i = 0; i < graphNodes.size(); i++) { - const auto fcNode = std::dynamic_pointer_cast(graphNodes[i]); - if (fcNode == nullptr) - continue; - - auto parent = fcNode->getParentEdgeAt(1)->getParent(); - const bool withTranspose = parent->getType() == Type::Transpose; - const NodePtr transposeNode = withTranspose ? parent : nullptr; - if (transposeNode) - parent = transposeNode->getParentEdgeAt(0)->getParent(); - // Compressed weights can be shared between several FC layers - const bool is_shared_decompression = parent->getChildEdges().size() > 1; - - const bool withReshape = parent->getType() == Type::Reshape; - const auto reshapeNode = withReshape ? 
parent : nullptr; - if (reshapeNode) { - parent = reshapeNode->getParentEdgeAt(0)->getParent(); - } - - const auto multiplyNode = parent; - if (multiplyNode->getType() != Type::Eltwise || multiplyNode->getAlgorithm() != Algorithm::EltwiseMultiply || - !multiplyNode->isConstant()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - CPU_GRAPH_OPTIMIZER_SCOPE(FuseFCAndWeightsDecompression); - const auto mulParent1 = multiplyNode->getParentEdgeAt(1)->getParent(); - NodePtr multiplyParent, multiplyConvertNode, multiplyConstNode; - multiplyParent = mulParent1; - if (multiplyParent->getType() == Type::Convert) { - multiplyConvertNode = multiplyParent; - multiplyParent = multiplyConvertNode->getParentEdgeAt(0)->getParent(); - } - multiplyConstNode = multiplyParent; - if (multiplyConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const bool withMultiplyConvert = multiplyConvertNode != nullptr; - - const auto mulParent0 = multiplyNode->getParentEdgeAt(0)->getParent(); - const bool withSubtract = mulParent0->getAlgorithm() == Algorithm::EltwiseSubtract; - NodePtr subtractNode, subtractConvertNode, subtractConstNode; - if (withSubtract) { - subtractNode = mulParent0; - if (!expectedNode(subtractNode, Type::Eltwise)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - auto subtractParent = subtractNode->getParentEdgeAt(1)->getParent(); - if (subtractParent->getType() == Type::Convert) { - subtractConvertNode = subtractParent; - subtractParent = subtractConvertNode->getParentEdgeAt(0)->getParent(); - } - subtractConstNode = subtractParent; - if (subtractConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - const bool withSubtractConvert = subtractConvertNode != nullptr; - const auto convertNode = withSubtract ? subtractNode->getParentEdgeAt(0)->getParent() : mulParent0; - if (!expectedNode(convertNode, Type::Convert)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const auto weightsNode = convertNode->getParentEdgeAt(0)->getParent(); - if (weightsNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Precision limitations - if (supportedDataPrecisions.find(fcNode->getOriginalInputPrecisionAtPort(0)) == supportedDataPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && - !one_of(subtractConstNode->getOriginalOutputPrecisionAtPort(0), weightsNode->getOriginalOutputPrecisionAtPort(0), ov::element::f32)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Shape limitations - const auto weightsShape = weightsNode->getOutputShapeAtPort(0); - if (weightsShape != multiplyNode->getOutputShapeAtPort(0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (reshapeNode && (reshapeNode->getInputShapeAtPort(0).getRank() != 3 || reshapeNode->getOutputShapeAtPort(0).getRank() != 2)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - VectorDims decompressionConstShape; - const auto fcInputWeightsShape = fcNode->getInputShapeAtPort(1); - int groupNum = 1; - // Ordinary case: one decompression group - if (fcInputWeightsShape.getRank() == weightsShape.getRank()) { - const auto& out_channels = fcInputWeightsShape.getDims()[0]; - decompressionConstShape = withTranspose ? 
VectorDims{1, out_channels} : VectorDims{out_channels, 1}; - } else { - // Group decompression case: last 3 dimension (there could be also prepending '1's in the beginning) of weights shape must be: - // [N, G, O], if transpose = true - // [O, N, G], otherwise. - // O - output channels - // N - number of groups - // G - group size - const auto& weights_dims = weightsShape.getStaticDims(); - const auto& N = withTranspose ? *(weights_dims.rbegin() + 2) : *(weights_dims.rbegin() + 1); - const auto& O = withTranspose ? *weights_dims.rbegin() : *(weights_dims.rbegin() + 2); - // Group decompression is applied by O and N dims - decompressionConstShape = withTranspose ? VectorDims{N, 1, O} : VectorDims{O, N, 1}; - groupNum = N; - } - - auto check_decompression_shape = [&decompressionConstShape](const VectorDims& shape_to_check) { - if (shape_to_check.size() > decompressionConstShape.size()) - return false; - if (std::all_of(shape_to_check.begin(), shape_to_check.end(), [](Dim x) { return x == 1; })) - return true; - const auto comparison_start_pos = decompressionConstShape.size() - shape_to_check.size(); - // in case of different ranks shapes are compared taking into account ranks numpy broadcasting - return std::equal(shape_to_check.begin(), shape_to_check.end(), decompressionConstShape.begin() + comparison_start_pos); - }; - if (!check_decompression_shape(multiplyConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && !check_decompression_shape(subtractConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - const size_t OC = fcInputWeightsShape.getDims()[0]; - const size_t IC = fcInputWeightsShape.getDims()[1]; - // HW specific shape limitations - if (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) && - fcNode->getOriginalInputPrecisionAtPort(0) == ov::element::bf16) { - // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a current solution conditions below are copied - // from OneDNN to make sure correct IP impl will be used since fallback one doesn't support weights decompression feature. 
- size_t simdWidth = 16; - size_t vnniFactor = 2; - size_t maxSize = 512; - auto amxRow = vnniFactor * simdWidth; - - if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - // OneDNN IP primitive provides limited decompression params support - if (IC % groupNum != 0 || IC / groupNum < 4 || OC == 1) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Fusion processing - auto *multiplyInputNode = dynamic_cast(multiplyConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", multiplyConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionMultiply(multiplyInputNode->getMemoryPtr()); - - if (withSubtract) { - auto *subtractInputNode = dynamic_cast(subtractConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", subtractConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionSubtract(subtractInputNode->getMemoryPtr()); - } - - fcNode->addOriginalLayer(multiplyNode->getOriginalLayers()); - fcNode->addOriginalLayer(convertNode->getOriginalLayers()); - if (withSubtract) - fcNode->addOriginalLayer(subtractNode->getOriginalLayers()); - if (withSubtractConvert) - fcNode->addOriginalLayer(subtractConvertNode->getOriginalLayers()); - if (withMultiplyConvert) - fcNode->addOriginalLayer(multiplyConvertNode->getOriginalLayers()); - - const auto& weightsPrecision = weightsNode->getOriginalOutputPrecisionAtPort(0); - if (withTranspose) { - transposeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - transposeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - if (withReshape) { - reshapeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - reshapeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - fcNode->setOriginalInputPrecisionAtPort(1, weightsPrecision); - - // If decompression subgraph is shared with other nodes, it mustn't be removed. - // In this case, the current FC is reconnected to the weights - if (is_shared_decompression) { - const auto weights_out_edge = weightsNode->getChildEdges()[0].lock(); - const auto fc_weights_path_edge = withTranspose ? transposeNode->getParentEdgeAt(0) - : fcNode->getParentEdgeAt(1); - const auto inNum = weights_out_edge->getInputNum(); - const auto outNum = fc_weights_path_edge->getOutputNum(); - graph.RemoveEdge(fc_weights_path_edge); - // In case of shared group decompression, Reshape node has to be copied for the current FC - if (withReshape) { - const auto& reshapeOutShape = reshapeNode->getOutputShapeAtPort(0).getStaticDims(); - auto reshapeConst = std::make_shared(ov::element::i32, - ov::Shape{reshapeOutShape.size()}, - reshapeOutShape); - auto reshapeDummyInput = std::make_shared(reshapeNode->getOriginalInputPrecisionAtPort(0), - reshapeNode->getInputShapeAtPort(0).toPartialShape()); - const auto reshape = std::make_shared(reshapeDummyInput, reshapeConst, false); - reshape->set_friendly_name(reshapeNode->getName() + "_copy"); - const auto cpuReshape = std::make_shared(reshape, graph.getGraphContext()); - graph.InsertNode(weightsNode, withTranspose ? transposeNode : fcNode, cpuReshape, inNum, outNum, false); - const auto cpuReshapeConst = std::make_shared(reshapeConst, graph.getGraphContext()); - graph.AddNode(cpuReshapeConst); - graph.CreateEdge(cpuReshapeConst, cpuReshape, 0, 1); - } else { - graph.CreateEdge(weightsNode, withTranspose ? 
transposeNode : fcNode, inNum, outNum); - } - } else { - // If decompression subgraph is not shared with other nodes, it can be removed - if (withSubtract) - graph.RemoveEdge(subtractNode->getParentEdgeAt(1)); - if (withSubtractConvert) { - // SubtractConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = subtractConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(subtractConvertNode->getParentEdgeAt(0)); - } - graph.RemoveEdge(multiplyNode->getParentEdgeAt(1)); - if (withMultiplyConvert) { - // MultiplyConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = multiplyConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(multiplyConvertNode->getParentEdgeAt(0)); - } - - graph.DropNode(convertNode); - if (withSubtract) - graph.DropNode(subtractNode); - graph.DropNode(multiplyNode); - } - DEBUG_LOG("FuseFCAndWeightsDecompression finished for node ", fcNode->getName()); - } -#undef SKIP_FUSION_FOR_NODE -} - void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { auto& graphNodes = graph.GetNodes(); @@ -556,7 +300,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { return false; if (!deconv) - return (one_of(node->getType(), Type::Convolution, Type::MatMul, Type::FullyConnected) && + return (one_of(node->getType(), Type::Convolution, Type::MatMul) && node->getParentEdges().size() == 2); else return deconv->canFuseBias(); @@ -984,9 +728,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) { auto isSuitablePattern = [](NodePtr parent) { bool res = true && parent->getType() == Type::Transpose && parent->getChildEdges().size() == 1 - && parent->getChildEdgeAt(0)->getOutputNum() == 1 && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected - && parent->getOutputShapeAtPort(0).getRank() == 2 && parent->isConstant(); return res; }; diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 886296a7c0053b..536ef468a09816 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -20,7 +20,6 @@ class GraphOptimizer { private: void FuseConvMatmulFCDeconvAndDQScales(Graph &graph); - void FuseFCAndWeightsDecompression(Graph &graph); void FuseConvolutionMatMulDeconvAndBias(Graph &graph); void FuseDeconvolutionAndSimpleOperation(Graph &graph); void FuseMultiplyAndAdd(Graph &graph); diff --git a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h index 4b641669262591..1575841cb2be9e 100644 --- a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h @@ -59,7 +59,9 @@ class EmptyMemoryDesc : public MemoryDesc { } MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override { - OPENVINO_THROW("Clone an empty memory desc with any precision (", prec, ") is prohibited"); + OPENVINO_ASSERT(prec == ov::element::undefined, + "Clone an empty memory desc with defined precision: ", prec, " is prohibited"); + return clone(); } private: diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index de5c53429138c4..ee0a99c3bba44e 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -6,6 +6,7 @@ #include "cpu_types.h" #include "edge.h" #include "partitioned_mem_blk.h" +#include 
"openvino/core/type/element_type.hpp" #include #include @@ -1673,7 +1674,7 @@ bool Node::isInputTensorAtPortEmpty(size_t port) const { auto edge = getParentEdgeAt(port); if (one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::Validated)) { auto&& mem = edge->getMemory(); - if (mem.isDefined()) { + if (mem.isDefined() && !mem.getDesc().empty()) { return mem.getShape().hasZeroDims(); } } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index cc42691950a3ff..9660178e1af4a4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -11,6 +11,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" #include "utils/debug_capabilities.h" +#include "utils/cpu_utils.hpp" #include "nodes/executors/debug_messages.hpp" #include "nodes/executors/implementation_utils.hpp" #include "nodes/convert.h" @@ -201,9 +202,22 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, MemoryArgs memoryArgs; memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); memoryArgs[ARG_WEI] = memory.at(ARG_WEI); + + auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr(); + + // normalize weights to 2D + const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims(); + const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims); + + originalWeightsDesc = std::make_shared(originalWeightsDesc->getPrecision(), Shape{wgtDims2D}); + + auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc); + auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision); + auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = memory.at(ARG_WEI)->getShape(); + const auto& wShape = originalWeightsDesc->getShape(); const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), @@ -214,6 +228,7 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); memoryArgs[ARG_DST] = memory.at(ARG_DST); } + // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); @@ -221,13 +236,6 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, expectedWeightFormat = isNeededReorder ? 
aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); - MemoryPtr dstMemPtr = std::make_shared(context->getEngine(), - memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision)); - auto dstDesc = dstMemPtr->getDescPtr(); - auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); - auto weiDesc = memory.at(ARG_WEI)->getDescPtr(); - auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); - if (isNeededReorder) { dnnl::impl::dim_t o_dim = 0; dnnl::impl::dim_t inner_dim = 1; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp index 8f9d7ad0805e41..61aca683a37687 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -157,8 +157,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const ConvAttrs& attrs, one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer - dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, {}, attrs.withBias, outputDataType); + DnnlPostOpsComposer dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, memory, outputDataType); return dnnlpoc.compose(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index fcb70d4753b2ce..780dbb6f2f3f11 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -115,9 +117,10 @@ DnnlMemoryDescPtr DnnlFCPrimitive::makeTransposedWeightDescriptor(const DnnlMemo return srcDesc; const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = - dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); + auto wDims = weiDesc.get_dims(); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); + + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -140,12 +143,11 @@ bool DnnlFCPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputT return false; } -bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose) { +static bool useDynamicQuantizationImpl(size_t dqGroupSize, + const MemoryDescPtr srcDesc, + const MemoryDescPtr weightsDesc, + const MemoryArgs& memory, + bool needTranspose) { if (dqGroupSize == 0) return false; @@ -155,6 +157,8 @@ bool 
DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (srcDesc->getPrecision() != ov::element::f32) return false; + + MemoryCPtr zpPtr = memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS) ? memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS) : nullptr; // For dynamic quantization, VNNI accumulation requires weight to be unsigned. // To support dynamic quantization with weights symmetrically quantized as i8/i4 // w/o zero-point, we will transform weight to u8/u4 weight with zp 128/8. @@ -177,11 +181,15 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (weightsDesc->getPrecision() == ov::element::u4) { int ic = weightsDesc->getShape().getStaticDims()[1]; int minGroupSize = INT_MAX; + + MemoryCPtr scalesPtr = memory.count(ARG_WEI | ARG_ATTR_SCALES) ? memory.at(ARG_WEI | ARG_ATTR_SCALES) : nullptr; + if (scalesPtr && scalesPtr->getShape().getRank() == 3) { auto scalesDims = scalesPtr->getShape().getStaticDims(); auto groupsNum = needTranspose ? scalesDims[1] : scalesDims[0]; minGroupSize = ic / groupsNum; } + if (zpPtr && zpPtr->getShape().getRank() == 3) { auto zpDims = zpPtr->getShape().getStaticDims(); int groupsNum = needTranspose ? zpDims[1] : zpDims[0]; @@ -196,11 +204,6 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, return true; } -template -static std::vector normalizeDimsTo2D(const std::vector& dims) { - return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies()), dims[dims.size() - 1]}; -} - static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -211,7 +214,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); const auto& originalDims = dstDesc->getShape().getMinDims(); - const auto& dims = normalizeDimsTo2D(originalDims); + const auto& dims = reshapeDownToRank<2>(originalDims); auto isINT8 = one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; @@ -223,21 +226,22 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, dims.size() - 1, isINT8, 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), + memory, outputDataType); - if (attrs.decompressionMultiplyPtr) { - auto dstPrc = attrs.decompressionMultiplyPtr->getPrecision(); + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = memory.at(ARG_WEI | ARG_ATTR_SCALES)->getPrecision(); if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - dnnlpoc.appendDecompressionScales(attrs.decompressionMultiplyPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); } - if (attrs.decompressionSubtractPtr) { + + if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? 
ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(attrs.decompressionSubtractPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), !attrs.weightsNonTransposed, dstPrc); } + if (useDynamicQuantization) { auto wei_precision = weiDesc->getPrecision(); bool is_symmetric_weights = (wei_precision == ov::element::i8) || (wei_precision == ov::element::i4); @@ -261,7 +265,7 @@ static dnnl::memory::desc normalizeDescriptor(const dnnl::memory::desc& desc) { const auto& dims = desc.get_dims(); if (dims.size() > 2) - return desc.reshape(normalizeDimsTo2D(dims)); + return desc.reshape(reshapeDownToRank<2>(dims)); return desc; } @@ -276,12 +280,13 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons const bool useWeightsDecompression) { const auto normalizedInputDesc = normalizeDescriptor(inputDesc); const auto normalizedOutputDesc = normalizeDescriptor(outputDesc); + const auto normalizedWeightDesc = normalizeDescriptor(weightDesc); const auto indt = normalizedInputDesc.get_data_type(); auto wdt = indt; if (useWeightsDecompression) { - wdt = weightDesc.get_data_type(); + wdt = normalizedWeightDesc.get_data_type(); // dynamic quantization with symmetric quantized weights needs unsigned weights uint64_t dynQuantGroupSize = 0; @@ -297,8 +302,8 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons } const dnnl::memory::desc weightsDesc = - useSparseWeights ? dnnl::memory::desc().sparse_desc(weightDesc.get_dims(), wdt) - : dnnl::memory::desc(weightDesc.get_dims(), wdt, memory::format_tag::any); + useSparseWeights ? dnnl::memory::desc().sparse_desc(normalizedWeightDesc.get_dims(), wdt) + : dnnl::memory::desc(normalizedWeightDesc.get_dims(), wdt, memory::format_tag::any); return dnnl::inner_product_forward::primitive_desc(engine, dnnl::prop_kind::forward_inference, @@ -387,8 +392,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs& useWeightsDecompression && useDynamicQuantizationImpl(attrs.dynamicQuantizationGroupSize, srcDesc, weiDesc, - attrs.decompressionMultiplyPtr, - attrs.decompressionSubtractPtr, + memory, !attrs.weightsNonTransposed); const auto postOpData = createPrimitiveAttrs(attrs, postOps, memory, context, useDynamicQuantization); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index 5295b9655066cc..21247f149ca69f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -75,13 +75,6 @@ class DnnlFCPrimitive { const DnnlShapeAgnosticDataPtr& shapeAgnosticData); private: - static bool useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose); - dnnl::stream m_stream; dnnl::primitive_desc m_primDesc; impl_desc_type m_implType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 1b8646c858e532..40c365ee5f4da5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -27,6 +27,7 @@ #include 
"nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/matmul_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -104,10 +105,10 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); - std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); const auto format = weightsNonTransposed ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; - const auto transposedWeiDesc = dnnl::memory::desc{wDims, wDataType, format}; + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -134,8 +135,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, dims.size() - 1, isINT8, 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), + memory, outputDataType); return dnnlpoc.compose(); @@ -262,7 +262,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& biasDesc = memory.at(ARG_BIAS)->getDescPtr(); auto dstDesc = memory.at(ARG_DST)->getDescPtr(); - MatMulAttrs mmAttrs{false, false, attrs.dequantizationScales}; + MatMulAttrs mmAttrs{false, false}; const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp index 09b3b33cfe6b2f..d08c4ad8127325 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp @@ -6,7 +6,6 @@ #include "post_ops.hpp" #include "memory_arguments.hpp" -#include "printers.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index f12795d5d1eb16..dd05cc58d43c32 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -19,7 +19,6 @@ namespace ov { namespace intel_cpu { -using namespace executor; template class ExecutorFactory { diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp index ad6479597c6971..1699a845a3314b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp @@ -19,13 +19,8 @@ struct FCAttrs { bool withBias = false; bool weightsNonTransposed = false; bool sparseWeights = false; - // @todo only memory descriptors should be a part of attributes - // actual memory should be passed into "execute" or "prepareMemory" calls - std::vector dequantizationScales; - // @todo should be passed as an additional memory input? 
- MemoryCPtr decompressionSubtractPtr; - MemoryCPtr decompressionMultiplyPtr; uint64_t dynamicQuantizationGroupSize; + ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 4cf6992985ecd3..10f472ddcd7283 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -441,8 +441,7 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context, std::shared_ptr shareAgnosticData) const { MatMulAttrs matMulAttrs{false, - false, - attrs.dequantizationScales}; + false}; auto primitive = DefaultInstantiator{}( memory, diff --git a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp index 9e484b24a2940e..e42bf3138bce91 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp @@ -12,7 +12,6 @@ namespace intel_cpu { struct MatMulAttrs { bool transposeA; bool transposeB; - std::vector dequantizationScales; }; using MatMulConfig = executor::Config; diff --git a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp index c04ca39e845ee1..7150226d27c601 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp @@ -14,7 +14,7 @@ namespace intel_cpu { using MemoryDescArgs = std::unordered_map; using MemoryArgs = std::unordered_map; -// @todo add more options +// basic inputs #define ARG_SRC_0 1 #define ARG_SRC ARG_SRC_0 #define ARG_SRC_1 2 @@ -24,6 +24,12 @@ using MemoryArgs = std::unordered_map; #define ARG_WEI_0 33 #define ARG_WEI ARG_WEI_0 #define ARG_BIAS 41 +// legacy dequantization scale +#define ARG_DST_DEQ_SCALE 53 +// scaling factors provided at execution time +#define ARG_ATTR_SCALES 4096 +// zero points provided at execution time +#define ARG_ATTR_ZERO_POINTS 8192 } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp index a03bfe2649413a..8fd945b773f262 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp @@ -23,6 +23,10 @@ using namespace executor; using namespace dnnl; using namespace ov::element; +static Dim batchDim(const VectorDims& dims) { + return std::accumulate(dims.begin(), dims.end() - 1, 1, std::multiplies()); +} + static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, const ExecutorContext::CPtr context, const bool weightsTransposed) { @@ -31,14 +35,15 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, // Weights are transposed by MatMulConstTransposesExtraction // K is the IC of weight // the weight is reshaped to [-1, K] in ConvertMatMulToFC - const auto K = wgtDims[1]; - const auto N = wgtDims[0]; + Dim K = wgtDims.back(); + Dim N = batchDim(wgtDims); auto packedBsize = mlas_sgemm_pack_get_size(N, K); auto create = [&]() { float* weightPtr = weightsMemory->getDataAs(); size_t ldb = weightsTransposed ? 
K : N; + MemoryPtr _ptr = std::make_shared(context->getEngine(), intel_cpu::CpuBlockedMemoryDesc(i8, intel_cpu::Shape{packedBsize})); float* prepackedDst = _ptr->getDataAs(); @@ -66,21 +71,10 @@ bool MlasGemmExecutor::supports(const FCConfig& config) { DEBUG_LOG("MlasGemmExecutor: PostOps are not supported"); return false; } - const auto& weiDesc = config.descs.at(ARG_WEI); - const auto& dstDesc = config.descs.at(ARG_DST); - // MLAS cannot support weight dims > 2, e.g. [1,64,9,9] * [10,64,9,9] - const auto& weightsDims = weiDesc->getShape().getStaticDims(); - if (weightsDims.size() > 2) { - if (!std::all_of(weightsDims.begin() + 2, weightsDims.end(), [](const Dim dim) { - return dim == 1; - })) { - DEBUG_LOG("MlasGemmExecutor: weights dims > 2 are not supported"); - return false; - } - } + const auto& dstDesc = config.descs.at(ARG_DST); - if (config.attrs.withBias) { + if (!config.descs.at(ARG_BIAS)->empty()) { const auto& biaDesc = config.descs.at(ARG_BIAS); const auto& biasDims = biaDesc->getShape().getStaticDims(); const auto& outDims = dstDesc->getShape().getDims(); @@ -108,24 +102,17 @@ MlasGemmExecutor::MlasGemmExecutor(const FCAttrs& attrs, const ExecutorContext::CPtr context) : m_attrs(attrs), m_memoryArgs(memory), - packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)) {} + packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)), + N(batchDim(memory.at(ARG_WEI)->getStaticDims())), + K(memory.at(ARG_WEI)->getStaticDims().back()) +{} bool MlasGemmExecutor::update(const MemoryArgs& memory) { - const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); - const auto& wgtDims = weiDesc->getShape().getStaticDims(); - // Weights are transposed by MatMulConstTransposesExtraction - // K is the IC of weight - // the weight is reshaped to [-1, K] in ConvertMatMulToFC - K = wgtDims[1]; - N = wgtDims[0]; const auto& outDims = dstDesc->getShape().getStaticDims(); - if (outDims.size() > 2) { - M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies()); - } else { - M = outDims[0]; - } + M = outDims.size() > 2 ? 
batchDim(outDims) : outDims[0]; + return true; } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 31ae4f26cc08a1..0f5c46e8bcd7cd 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -10,6 +10,7 @@ #include "common/cpu_convert.h" #include "common/cpu_memcpy.h" +#include "cpu_types.h" #include "dnnl_extension_utils.h" #include "executors/memory_arguments.hpp" #include "graph_context.h" @@ -19,11 +20,16 @@ #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "ov_ops/fully_connected_compressed.hpp" #include "post_ops.hpp" #include "shape_inference/custom/fullyconnected.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "transformations/utils/utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -39,25 +45,76 @@ namespace node { bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto fc = std::dynamic_pointer_cast(op); - if (!fc) { - errorMessage = "Only legacy FullyConnected operation is supported"; + if (!ov::is_type(op) && + !ov::is_type(op) && + !ov::is_type(op)) { return false; } - if (fc->get_input_size() == 3 && - std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS_ID)) == nullptr) { - errorMessage = "Only Constant operation on 'bias' input is supported"; + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(BIAS))) { + errorMessage = "Only Constant operation on 'bias' input is supported"; + return false; + } + } + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(WEIGHT_SCALES)) || + !ov::op::util::is_on_constant_path(op->input_value(WEIGHT_ZERO_POINTS))) { + errorMessage = "Only Constant operation on 'weight scales', and 'weight zero points' inputs is supported"; + return false; + } + } + } catch (...) { + return false; + } + + return true; +} + +// @todo replace 'inferencePrecision' check with 'fc->get_input_element_type(0) == ov::element::bf16' +// after bf16 pipeline is moved to ConvertPrecision +bool FullyConnected::isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept { +#if defined(OPENVINO_ARCH_X86_64) + try { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) return false; + + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) + return false; + + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && + inferencePrecision == ov::element::bf16) { + // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a + // current solution conditions below are copied from OneDNN to make sure correct IP impl will be + // used since fallback one doesn't support weights decompression feature. 
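To make the heuristic in this comment concrete, here is a standalone sketch of the same shape screen with a few made-up IC/OC values; the constants match the lines that follow:

    #include <cstddef>
    #include <cstdio>

    // Shapes for which OneDNN would fall back from the AMX inner-product
    // implementation (and thus lose weights decompression) are rejected.
    static bool amx_shape_supported(std::size_t IC, std::size_t OC) {
        const std::size_t simdWidth = 16;
        const std::size_t vnniFactor = 2;
        const std::size_t maxSize = 512;
        const std::size_t amxRow = vnniFactor * simdWidth; // 32
        if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0))
            return false;
        return true;
    }

    int main() {
        std::printf("4096x4096 -> %d\n", amx_shape_supported(4096, 4096)); // 1: large, AMX-friendly
        std::printf(" 512x 512 -> %d\n", amx_shape_supported(512, 512));   // 1: IC is a multiple of 32
        std::printf(" 500x 500 -> %d\n", amx_shape_supported(500, 500));   // 0: small and IC % 32 != 0
        std::printf("  16x  16 -> %d\n", amx_shape_supported(16, 16));     // 0: fits a single AMX row
        return 0;
    }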
+ size_t simdWidth = 16; + size_t vnniFactor = 2; + size_t maxSize = 512; + auto amxRow = vnniFactor * simdWidth; + + if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { + return false; + } } - const auto weightRank = fc->get_input_partial_shape(WEIGHTS_ID).size(); - if (weightRank != 2) { - errorMessage = "Doesn't support 'weight' input with rank: " + std::to_string(weightRank); + + if (IC % G != 0 || IC / G < 4 || OC == 1) { return false; } + + return true; } catch (...) { return false; } return true; +#else + return false; +#endif } void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) { @@ -79,6 +136,31 @@ FullyConnected::FullyConnected(const std::shared_ptr& op, const GraphC initTensorParallelConfig(context); if (!isSupportedOperation(op, errorMessage)) OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + + m_atoi[ARG_SRC] = DATA; + m_atoi[ARG_WEI] = WEIGHTS; + m_atoi[ARG_BIAS] = BIAS; + + auto mapArgToInput = [&op](std::unordered_map& argToInput, size_t argId, size_t inputId) { + if (op->get_input_size() > inputId && + op->input(inputId).get_element_type() != ov::element::undefined) { + argToInput[argId] = inputId; + } + }; + + if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_SCALES, WEIGHT_SCALES); + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_ZERO_POINTS, WEIGHT_ZERO_POINTS); + algorithm = Algorithm::FullyConnectedCompressed; + } else if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_DST_DEQ_SCALE, 3); + algorithm = Algorithm::FullyConnectedQuantizedLegacy; + } else if (ov::is_type(op)) { + algorithm = Algorithm::FullyConnectedQuantized; + OPENVINO_THROW_NOT_IMPLEMENTED("FullyConnectedQuantized is not implemented yet"); + } else { + algorithm = Algorithm::FullyConnectedCommon; + } } bool FullyConnected::canBeExecutedInInt8() const { @@ -220,6 +302,7 @@ void FullyConnected::execTensorParallelSync() { } } } + void FullyConnected::execute(dnnl::stream strm) { initTensorParallelSync(); @@ -366,31 +449,11 @@ static bool useSparseWeightsDecompression(const NodePtr& weightsInput, return sparseRate >= minSparseRate; } -void FullyConnected::needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales) { - if (tp_cfg.enable_tensor_parallel) { - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, average); - parts.back() = len - average * (n - 1); - return parts; - }; - auto DQScales = getDQScales(); - auto split_lens = split_parts(DQScales.size(), tp_cfg.w_size); - auto split_offset = tp_cfg.w_rank * split_lens[0]; - std::vector newDQScales(split_lens[tp_cfg.w_rank]); - std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], newDQScales.begin()); - dequantizationScales = std::move(newDQScales); - } -} - void FullyConnected::initSupportedPrimitiveDescriptors() { - attrs.withBias = getOriginalInputsNumber() == 3; - - attrs.dequantizationScales = getDQScales(); - needUpdateDQScaleForTensorParallel(attrs.dequantizationScales); + attrs.withBias = getOriginalInputPrecisionAtPort(BIAS) != ov::element::undefined; - attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS_ID)->getParent(), - getOriginalInputPrecisionAtPort(DATA_ID), + attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(), + getOriginalInputPrecisionAtPort(DATA), context->getConfig().fcSparseWeiDecompressionRate); attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize; 
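For context on the sparse-weights choice made just above: useSparseWeightsDecompression accepts the weights only when the measured sparsity rate reaches the configured fcSparseWeiDecompressionRate threshold (the `return sparseRate >= minSparseRate;` seen earlier). A minimal sketch of such a rate check, assuming the rate is simply the fraction of zero-valued int8 weights (an assumption for illustration, not necessarily the plugin's exact definition):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Fraction of zero elements in an int8 weight buffer; this definition of
    // the sparse rate is assumed here purely for illustration.
    static float sparse_rate(const std::vector<std::int8_t>& weights) {
        if (weights.empty())
            return 0.0f;
        std::size_t zeros = 0;
        for (std::int8_t w : weights)
            if (w == 0)
                ++zeros;
        return static_cast<float>(zeros) / static_cast<float>(weights.size());
    }

    int main() {
        const float minSparseRate = 0.8f; // stand-in for fcSparseWeiDecompressionRate
        std::vector<std::int8_t> weights(1000, 0);
        for (std::size_t i = 0; i < weights.size(); i += 10)
            weights[i] = 1;                            // 10% non-zero, i.e. 90% sparse
        assert(sparse_rate(weights) >= minSparseRate); // decompression would be considered
        return 0;
    }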
attrs.modelType = context->getConfig().modelType; @@ -406,6 +469,10 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { VecMemoryDescs srcDescs; const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); for (size_t i = 0; i < srcTypes.size(); i++) { + if (srcTypes[i] == element::undefined) { + srcDescs.push_back(MemoryDescUtils::makeEmptyDesc()); + continue; + } const auto srcDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(srcTypes[i], getInputShapeAtPort(i)); srcDescs.push_back(srcDesc); } @@ -417,23 +484,31 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { } MemoryDescArgs descs{ - {ARG_SRC, srcDescs[0]}, - {ARG_WEI, srcDescs[1]}, - {ARG_BIAS, attrs.withBias ? srcDescs[2] : MemoryDescUtils::makeEmptyDesc()}, + {ARG_SRC, srcDescs[DATA]}, + {ARG_WEI, srcDescs[WEIGHTS]}, + {ARG_BIAS, srcDescs[BIAS]}, {ARG_DST, dstDescs[0]}, }; - needUpdateScaleForTensorParallel(); - needUpdateZeroPointForTensorParallel(); - auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); - if (attrs.withBias) nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); + nodeConfig.inConfs.resize(srcDescs.size()); + + for (const auto& desc : nodeDescriptors) { + if (m_atoi.count(desc.first)) { + nodeConfig.inConfs[m_atoi[desc.first]] = desc.second; + } + } + + // add extra inputs bypassing proper memory descriptors + // @todo pass all the input descriptors to getProperMemoryDescriptors and allow + // to ignore extra input descriptors if necessery + for (size_t i = 3; i < srcDescs.size(); i++) { + nodeConfig.inConfs[i] = srcDescs[i]; + } const int inPlace = canBeInPlace() ? 0 : -1; nodeConfig.outConfs.emplace_back(nodeDescriptors.at(ARG_DST), BlockedMemoryDesc::FULL_MASK, inPlace); @@ -443,11 +518,11 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { void FullyConnected::needSplitMemoryForTensorParallel() { if (tp_cfg.enable_tensor_parallel) { - auto src = getSrcMemoryAtPort(DATA_ID); - auto wgt = getSrcMemoryAtPort(WEIGHTS_ID); + auto src = getSrcMemoryAtPort(DATA); + auto wgt = getSrcMemoryAtPort(WEIGHTS); auto dst = getDstMemoryAtPort(0); // src - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); + memory[ARG_SRC] = getSrcMemoryAtPort(DATA); // wgt // split N direction tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) @@ -455,7 +530,7 @@ void FullyConnected::needSplitMemoryForTensorParallel() { memory[ARG_WEI] = tp_cfg.cached_splited_weight; // bias if (attrs.withBias) { - auto bias = getSrcMemoryAtPort(BIAS_ID); + auto bias = getSrcMemoryAtPort(BIAS); auto select_bias = split_horizontal(context->getEngine(), std::move(bias), 0, tp_cfg.w_rank, tp_cfg.w_size); tp_cfg.cached_splited_bias = std::move(select_bias); } else { @@ -465,6 +540,21 @@ void FullyConnected::needSplitMemoryForTensorParallel() { // dst memory[ARG_DST] = getDstMemoryAtPort(0); tp_cfg.cached_dst = split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); + + memory[ARG_DST | ARG_ATTR_SCALES] = split_horizontal(context->getEngine(), memory[ARG_DST | ARG_ATTR_SCALES], 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto scale_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_SCALES]); + memory[ARG_WEI | ARG_ATTR_SCALES] = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto zeropoint_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_ZERO_POINTS]); + auto element_num = zeropoint_mem->getSize() / zeropoint_mem->getPrecision().size(); + if (element_num == 1) { + tp_cfg.cached_zeropoint = zeropoint_mem; + } else { + tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + } } } @@ -473,7 +563,7 @@ void FullyConnected::needUpdateTensorParalelConfig() { // 1. weight shape is dynamic // 2. last dim can be splited. if (tp_cfg.enable_tensor_parallel) { - auto& shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); + auto& shape = getSrcMemoryAtPort(WEIGHTS)->getShape(); if (shape.isDynamic()) { tp_cfg.enable_tensor_parallel = false; } else if (shape.getDims()[0] < static_cast(tp_cfg.w_size)) { @@ -481,12 +571,16 @@ void FullyConnected::needUpdateTensorParalelConfig() { } } } + void FullyConnected::createPrimitive() { needUpdateTensorParalelConfig(); - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); - memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS_ID); - memory[ARG_BIAS] = attrs.withBias ? getSrcMemoryAtPort(BIAS_ID) : MemoryDescUtils::makeEmptyMemory(context); + for (const auto& entry : m_atoi) { + const auto argumentId = entry.first; + const auto inputId = entry.second; + memory[argumentId] = getSrcMemoryAtPort(inputId); + } + memory[ARG_DST] = getDstMemoryAtPort(0); needSplitMemoryForTensorParallel(); @@ -513,49 +607,6 @@ ov::element::Type FullyConnected::getRuntimePrecision() const { return getMaxPrecision(srcTypes); } -void FullyConnected::needUpdateScaleForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_scale) { - attrs.decompressionMultiplyPtr = tp_cfg.cached_scale; - } -} - -void FullyConnected::needSplitScaleForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_scale) { - auto scale_mem = std::const_pointer_cast(memory); - tp_cfg.cached_scale = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size); - } -} - -void FullyConnected::needUpdateZeroPointForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_zeropoint) { - attrs.decompressionSubtractPtr = tp_cfg.cached_zeropoint; - } -} - -void FullyConnected::needSplitZeroPointForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_zeropoint) { - auto zeropoint_mem = std::const_pointer_cast(memory); - auto element_num = memory->getSize() / memory->getPrecision().size(); - if (element_num == 1) { - tp_cfg.cached_zeropoint = std::move(zeropoint_mem); - } else { - tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); - } - } -} - -void FullyConnected::fuseDecompressionMultiply(const MemoryCPtr& memory) { - attrs.decompressionMultiplyPtr = memory; - needSplitScaleForTensorParallel(memory); -} - -void FullyConnected::fuseDecompressionSubtract(const MemoryCPtr& memory) { - attrs.decompressionSubtractPtr = memory; - needSplitZeroPointForTensorParallel(memory); -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index 8c17228e365af4..177edd3d426339 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -6,9 +6,11 @@ #include +#include #include #include #include +#include #include #include "cpu_memory.h" @@ -65,6 +67,15 @@ class FullyConnected : public Node { bool canFuse(const NodePtr& node) const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept; + + bool isExecutable() const override { + return !isInputTensorAtPortEmpty(0); + } void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; @@ -80,9 +91,21 @@ class FullyConnected : public Node { void toNumaNodeImpl(int numaID) override; private: - static const size_t DATA_ID = 0; - static const size_t WEIGHTS_ID = 1; - static const size_t BIAS_ID = 2; + enum InputId : size_t { + DATA = 0, + WEIGHTS, + BIAS, + WEIGHT_SCALES, + WEIGHT_ZERO_POINTS, + INPUT_SCALES, + INPUT_ZERO_POINTS, + OUTPUT_SCALES, + OUTPUT_ZERO_POINTS, + }; + + static bool isConstantInput(const std::shared_ptr& op, InputId port); + + std::unordered_map m_atoi; // memory argument id to input id void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); @@ -92,11 +115,6 @@ class FullyConnected : public Node { void initTensorParallelSync(); void execTensorParallelSync(); void needSplitMemoryForTensorParallel(); - void needSplitScaleForTensorParallel(const MemoryCPtr& memory); - void needUpdateScaleForTensorParallel(); - void needSplitZeroPointForTensorParallel(const MemoryCPtr& memory); - void needUpdateZeroPointForTensorParallel(); - void needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales); FCAttrs attrs; PostOps postOps; diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 
1f650bd8c5de17..4ccdc87ada25f1 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -7,7 +7,10 @@ #include "cpu/x64/jit_generator.hpp" #include "nodes/node_config.h" #include "openvino/core/parallel.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -228,9 +231,9 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op->get_type_name(), " with name ", op->get_friendly_name()); - constOp = ov::as_type_ptr(op); - if (constOp) { + if (auto constOp = ov::as_type_ptr(op)) { constant = ConstantType::Const; + m_constOp = constOp; cloneBlobIfRequired(); } else { constant = ConstantType::StrictNoConst; @@ -238,8 +241,14 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte } void Input::cloneBlobIfRequired() { - Shape shape(constOp->get_shape().empty() ? ov::Shape(1, 1) : constOp->get_shape()); - const auto prec = constOp->get_element_type(); + const auto prec = m_constOp->get_element_type(); + + if (prec == ov::element::undefined && shape_size(m_constOp->get_shape()) == 0) { + memoryPtr = MemoryDescUtils::makeEmptyMemory(context); + return; + } + + Shape shape(m_constOp->get_shape().empty() ? ov::Shape(1, 1) : m_constOp->get_shape()); const size_t size = shape.getElementsCount(); CpuBlockedMemoryDesc memDesc(prec, shape); @@ -258,21 +267,21 @@ void Input::cloneBlobIfRequired() { // oneDNN always allocate 1byte for element type with bitWidth < 8 (u4,u1...) // but ngraph Constant uses actual bitWidth for data storage allocation // in that case we make a copy to avoid overflow - if (constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { - if (constOp->get_element_type() == element::string) { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + if (m_constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { + if (m_constOp->get_element_type() == element::string) { + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } else { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } } else { - if (constOp->get_element_type() == element::string) { + if (m_constOp->get_element_type() == element::string) { memory = std::make_shared(getEngine(), memDesc); - auto src = constOp->get_data_ptr(); + auto src = m_constOp->get_data_ptr(); auto dst = memory->getDataAs(); std::copy(src, src + size, dst); } else { memory = std::make_shared(getEngine(), memDesc); - memcpy(memory->getData(), constOp->get_data_ptr(), constOp->get_byte_size()); + memcpy(memory->getData(), m_constOp->get_data_ptr(), m_constOp->get_byte_size()); } } @@ -287,22 +296,22 @@ void Input::cloneBlobIfRequired() { return ptr; }; - auto isBlobAligned = [&] () { - bool blobAlignedOnSSE = true; + auto isBlobAligned = [] (const std::shared_ptr& constant) { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) // Majority of arithmetic and data processing instructions in legacy SSE isa requires // the memory address in the operands must be aligned on 16-byte boundary. To ensure // safely reusing ngraph const blob memory, need to check address alignment. 
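The alignment requirement mentioned in this comment is exactly the `(address & 15) == 0` test on the following lines: a pointer is 16-byte aligned when its four lowest address bits are zero. A small standalone sketch (plain C++, illustrative buffer only):

    #include <cassert>
    #include <cstdint>

    // True when 'ptr' sits on a 16-byte boundary, i.e. the low 4 address bits
    // are zero, which is what legacy SSE aligned loads require.
    static bool aligned16(const void* ptr) {
        return (reinterpret_cast<std::uintptr_t>(ptr) & 15) == 0;
    }

    int main() {
        alignas(16) unsigned char buffer[32] = {};
        assert(aligned16(buffer));      // start of an alignas(16) buffer
        assert(!aligned16(buffer + 1)); // an odd offset cannot be 16-byte aligned
        assert(aligned16(buffer + 16)); // the next 16-byte boundary
        return 0;
    }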
- const void *ptr = constOp->get_data_ptr(); - blobAlignedOnSSE = mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); + const void *ptr = constant->get_data_ptr(); + return mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); +#else + return true; #endif - return blobAlignedOnSSE; }; // The presence of subnormals is better to determined at IR read time. auto hasSubnormals = [&] () { if (prec == ov::element::f32) { - uint32_t const *u32data = constOp->get_data_ptr(); + uint32_t const *u32data = m_constOp->get_data_ptr(); if (!size) return false; @@ -345,7 +354,7 @@ void Input::cloneBlobIfRequired() { auto blobKey = [&] () { char ptr[32]; - snprintf(ptr, sizeof ptr, "%p", constOp->get_data_ptr()); + snprintf(ptr, sizeof ptr, "%p", m_constOp->get_data_ptr()); return getName() + "_" + std::to_string(size * prec.size()) + "_" + ptr; @@ -356,12 +365,13 @@ void Input::cloneBlobIfRequired() { prec != element::string && // IRs already have all subnormals flushed to zero, but in // read_model scenario with directly loaded original model still can have subnormals - isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) && + isBlobAligned(m_constOp) && (!needFlushDenormalsToZero || !hasSubnormals()) && // Blob should be cloned in cache only if original weights are stored on other numa node. // This is possible only in multistream case on multisocket machine. // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored. (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1); - memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()) + + memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()) : std::const_pointer_cast( weightCache ? *weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob()); } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..e659ea2359aabd 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -75,7 +75,7 @@ class Input : public Node { void initSupportedPdFromMemDesc(); private: - std::shared_ptr constOp; + std::shared_ptr m_constOp; MemoryCPtr memoryPtr; bool isMeanImage = false; MemoryDescPtr extMemDesc = nullptr; diff --git a/src/plugins/intel_cpu/src/nodes/reference.cpp b/src/plugins/intel_cpu/src/nodes/reference.cpp index 5dc7c8818dd52b..b84836c869deb3 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.cpp +++ b/src/plugins/intel_cpu/src/nodes/reference.cpp @@ -29,7 +29,7 @@ Reference::Reference(const std::shared_ptr& op, : Node(op, context, ReferenceShapeInferFactory(op)), ovCoreNode(op), additionalErrorMessage(errorMessage) { if (!op->has_evaluate()) { OPENVINO_THROW_NOT_IMPLEMENTED( - "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented)"); + "Cannot fallback on ngraph reference implementation. 
Ngraph::Node::evaluate() is not implemented for op: ", *op); } setType(Type::Reference); diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp b/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp index 5aef73df1949bd..048b413b61a60b 100644 --- a/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp @@ -15,7 +15,7 @@ Result FCShapeInfer::infer( const VectorDims& activationShape = input_shapes[0].get(); const VectorDims& weightShape = input_shapes[1].get(); size_t activationRank = activationShape.size(); - size_t channelRank = weightShape.size() - 1; + size_t channelRank = 1; // activation weight output_shape // NCHW CoCHW NCo @@ -23,7 +23,7 @@ Result FCShapeInfer::infer( // NC CoC NCo VectorDims outputShape(out_rank, 1); // set Co - outputShape.back() = weightShape[0]; + outputShape.back() = std::accumulate(weightShape.begin(), weightShape.end() - 1, 1, std::multiplies()); // set batch dims size_t batchRank = activationRank - channelRank; size_t startIdx = out_rank - batchRank - 1; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp deleted file mode 100644 index a6d97b6a84b613..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "fully_connected.hpp" -#include "transformations/itt.hpp" - -ov::intel_cpu::FullyConnectedNode::FullyConnectedNode(const ov::Output& A, - const ov::Output& B, - const ov::Rank& output_rank, - const ov::element::Type output_type) - : Op({A, B}), m_output_rank(output_rank), m_output_type(output_type) { - validate_and_infer_types(); -} - -std::shared_ptr ov::intel_cpu::FullyConnectedNode::clone_with_new_inputs(const ov::OutputVector& new_args) const { - INTERNAL_OP_SCOPE(FullyConnectedNode_clone_with_new_inputs); - check_new_args_count(this, new_args); - - return std::make_shared(new_args.at(0), new_args.at(1), m_output_rank, m_output_type); -} - -void ov::intel_cpu::FullyConnectedNode::validate_and_infer_types() { - INTERNAL_OP_SCOPE(FullyConnectedNode_validate_and_infer_types); - const auto input_size = get_input_size(); - NODE_VALIDATION_CHECK(this, - input_size == 2, - "Number of inputs is incorrect. 
Current value is: ", - input_size, - ", expected: 2."); - - // Weights shape: [O, I1, ..., Im]; - // O - output channels dimensions, Ik - input channels dimensions - const auto weights_pshape = get_input_partial_shape(1); - NODE_VALIDATION_CHECK(this, - weights_pshape.is_static(), - "Weights pshape must be static"); - const auto weights_shape = weights_pshape.to_shape(); - - NODE_VALIDATION_CHECK(this, - weights_pshape.size() > 0, - "Weights rank must be greater than 0"); - - const auto o_channels = weights_pshape[0]; - - // Activations shape: [B1, ..., Bn, I1, ..., Im]; - // Bi - batch dimensions, Ik - input channels dimensions - const auto activations_pshape = get_input_partial_shape(0); - - // Result shape: [B1, ..., Bn, O] - ov::PartialShape output_pshape; - if (activations_pshape.rank().is_static()) { - size_t output_channels_dimensions_count = weights_shape.size() - 1; - for (size_t i = 0; i < activations_pshape.size() - output_channels_dimensions_count; ++i) { - output_pshape.push_back(activations_pshape[i]); - } - output_pshape.push_back(o_channels); - - NODE_VALIDATION_CHECK(this, - m_output_rank.is_static(), - "Output rank must be static if activations rank is static."); - - while (output_pshape.rank().get_length() < m_output_rank.get_length()) { - output_pshape.insert(output_pshape.begin(), 1); - } - } else { - output_pshape = ov::PartialShape::dynamic(); - } - - auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; - set_output_type(0, output_type, output_pshape); -} - -bool ov::intel_cpu::FullyConnectedNode::visit_attributes(ov::AttributeVisitor &visitor) { - INTERNAL_OP_SCOPE(FullyConnectedNode_visit_attributes); - visitor.on_attribute("out-rank", m_output_rank); - visitor.on_attribute("out-type", m_output_type); - return true; -} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp deleted file mode 100644 index d992b76cf0b79b..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/core/node.hpp" -#include "openvino/op/op.hpp" - -namespace ov { -namespace intel_cpu { - -class FullyConnectedNode : public ov::op::Op { -public: - OPENVINO_OP("FullyConnected", "cpu_plugin_opset"); - - FullyConnectedNode() = default; - - FullyConnectedNode(const ov::Output &A, - const ov::Output &B, - const ov::Rank& output_rank, - const ov::element::Type output_type = ov::element::undefined); - - bool visit_attributes(ov::AttributeVisitor &visitor) override; - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - - ov::Rank get_output_rank() const { return m_output_rank; } - ov::element::Type get_output_type() const { return m_output_type; } - -private: - ov::Rank m_output_rank; - ov::element::Type m_output_type; -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp index f2861843a81110..da25e9aac30240 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "openvino/core/type/element_type.hpp" +#include "ov_ops/fully_connected.hpp" #include "convert_matmul_to_fc.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/transpose.hpp" -#include "openvino/op/reshape.hpp" #include "openvino/core/rt_info.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" @@ -135,22 +135,6 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() { OPENVINO_THROW("MatMul " + matmul->get_friendly_name() + " shapes are inconsistent."); } - // Transferring from MatMul representation: [B, I, K] * [B, K, O] = [B, I, O] - // to FullyConnected representation: [I, K] * [K, O] = [I, O] - - if (rank_b != 2) { - ov::Dimension K = *(shape_b_aligned.rbegin() + 1); - OPENVINO_ASSERT(K.is_static()); - auto k_len = K.get_length(); - auto reshape_shape_values = matmul->get_transpose_b() ? std::vector{-1, k_len} : std::vector{k_len, -1}; - auto reshape_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, reshape_shape_values); - fc_input_b = ov::op::util::make_try_fold(fc_input_b, reshape_shape, false); - if (!std::dynamic_pointer_cast(fc_input_b.get_node_shared_ptr())) { - new_ops.push_back(reshape_shape); - } - new_ops.push_back(fc_input_b.get_node_shared_ptr()); - } - // Weights normalization if (!matmul->get_transpose_b()) { fc_input_b = create_transpose(fc_input_b, matmul->get_friendly_name() + "/transpose_b"); @@ -169,10 +153,14 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() { fc_input_b = convert; } - // Create FullyConnected - auto output_rank = matmul->get_output_partial_shape(0).rank(); - auto fc = std::make_shared(fc_input_a, fc_input_b, output_rank, - matmul->get_output_element_type(0)); + auto bias = std::make_shared(element::undefined, Shape{0}); + new_ops.push_back(bias); + + auto fc = std::make_shared(fc_input_a, + fc_input_b, + bias, + matmul->get_output_element_type(0)); + fc->set_friendly_name(matmul->get_friendly_name()); ///todo: CVS-130863 Remove after fp16_compression is copyable if (ov::fp16_compression_is_disabled(matmul)) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp index 69991802101138..7d75fcc19170d0 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp @@ -4,7 +4,7 @@ #pragma once -#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/matcher_pass.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp index 8079286d1e3ad7..03d9a294bbcab9 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp @@ -12,7 +12,7 @@ #include "openvino/pass/pattern/op/or.hpp" #include "transformations/rt_info/dequantization_node.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" -#include 
"transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "utils/general_utils.h" #include "itt.hpp" @@ -47,16 +47,16 @@ bool isConvertableToPowerStatic(const std::shared_ptr &node) { return ov::shape_size(const_shape) == 1 && input_rank.get_length() >= static_cast(const_shape.size()) && !ov::intel_cpu::one_of(node->get_input_node_shared_ptr(nonConstPort)->get_type_info(), - ov::opset1::NormalizeL2::get_type_info_static(), - ov::opset4::Interpolate::get_type_info_static(), - ov::opset1::Convolution::get_type_info_static(), - ov::opset1::GroupConvolution::get_type_info_static(), - ov::opset1::ConvolutionBackpropData::get_type_info_static(), - ov::opset1::GroupConvolutionBackpropData::get_type_info_static(), - ov::opset1::MatMul::get_type_info_static(), - ov::intel_cpu::FullyConnectedNode::get_type_info_static(), - ov::op::v0::MVN::get_type_info_static(), - ov::opset6::MVN::get_type_info_static()); + ov::opset1::NormalizeL2::get_type_info_static(), + ov::opset4::Interpolate::get_type_info_static(), + ov::opset1::Convolution::get_type_info_static(), + ov::opset1::GroupConvolution::get_type_info_static(), + ov::opset1::ConvolutionBackpropData::get_type_info_static(), + ov::opset1::GroupConvolutionBackpropData::get_type_info_static(), + ov::opset1::MatMul::get_type_info_static(), + ov::op::internal::FullyConnected::get_type_info_static(), + ov::op::v0::MVN::get_type_info_static(), + ov::opset6::MVN::get_type_info_static()); } template <> diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp new file mode 100644 index 00000000000000..d92d2d3627b65b --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fc_bias_fusion.hpp" + +#include +#include + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "transformations/utils/utils.hpp" + +ov::intel_cpu::FullyConnectedBiasFusion::FullyConnectedBiasFusion() { + MATCHER_SCOPE(FullyConnectedBiasFusion); + + auto input = ov::pass::pattern::any_input(ov::pass::pattern::has_static_rank()); + auto weights = ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape()); + auto bias = ov::pass::pattern::wrap_type(); + auto m_fc = ov::pass::pattern::wrap_type({input, weights, bias}, + ov::pass::pattern::consumers_count(1)); + auto m_bias = ov::pass::pattern::wrap_type(); + auto m_add = ov::pass::pattern::wrap_type({m_fc, m_bias}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + + auto add = pattern_to_output[m_add].get_node_shared_ptr(); + auto bias = pattern_to_output[m_bias].get_node_shared_ptr(); + + auto fc = pattern_to_output[m_fc].get_node_shared_ptr(); + + if (transformation_callback(fc)) { + return false; + } + + ov::Shape bias_shape(bias->get_shape()); + const ov::PartialShape& output_shape = fc->get_output_partial_shape(0); + size_t bias_size = ov::shape_size(bias_shape); + auto rank = output_shape.size(); + if (rank == 0 || output_shape[rank - 1].is_dynamic()) { + return false; + } + + if (bias_shape.empty() || 
static_cast(bias_shape.back()) != output_shape[rank - 1].get_length() || + bias_shape.back() != bias_size) { + return false; + } + + ov::NodeVector new_ops; + + std::shared_ptr final_bias = bias; + if (bias_shape.size() >= 2) { + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}); + final_bias = ov::op::util::make_try_fold(final_bias, reshape_const, true); + new_ops.push_back(final_bias); + } + + std::shared_ptr fc_with_bias; + + auto fc_node = ov::as_type_ptr(fc); + fc_with_bias = fc_node->clone_with_new_inputs({fc_node->input_value(0), fc_node->input_value(1), final_bias}); + + new_ops.push_back(fc_with_bias); + + fc_with_bias->set_friendly_name(add->get_friendly_name()); + ov::copy_runtime_info({fc, add}, new_ops); + ov::replace_node(add, fc_with_bias); + return true; + }; + + auto m = std::make_shared(m_add, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp new file mode 100644 index 00000000000000..b21cf80ad327e6 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" + +namespace ov { +namespace intel_cpu { + +class FullyConnectedBiasFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("FullyConnectedBiasFusion", "0"); + FullyConnectedBiasFusion(); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp index e681cd48ce8087..18a54dc45e173f 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "move_fc_reshape_to_weights.hpp" #include #include @@ -48,7 +48,8 @@ ov::intel_cpu::MoveFCReshapeToWeights::MoveFCReshapeToWeights() { auto weights_input_m = std::make_shared(ov::OutputVector{reshape_m, transpose_m}); auto data_m = any_input(); - auto fully_connected_m = wrap_type({data_m, weights_input_m}); + auto bias_m = any_input(); + auto fully_connected_m = wrap_type({data_m, weights_input_m, bias_m}); ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { const auto fully_connected = m.get_match_root(); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp deleted file mode 100644 index 27207b3e051fdb..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "openvino/core/rt_info.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/pass/constant_folding.hpp" -#include -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include 
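
The shape check performed by the new FullyConnectedBiasFusion pass above can be restated in a small standalone sketch: every bias dimension except the last must be 1, the last bias dimension must equal the FC output channel count, and a multi-dimensional bias is then flattened to a 1-D vector (the Reshape(-1) the pass inserts).

// Standalone restatement of the fusion precondition; not the pass itself.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using Shape = std::vector<size_t>;

bool bias_is_fusable(const Shape& bias_shape, int64_t output_channels) {
    if (bias_shape.empty())
        return false;
    const size_t bias_size =
        std::accumulate(bias_shape.begin(), bias_shape.end(), size_t{1}, std::multiplies<size_t>());
    // last dim must match Co and must carry all elements (i.e. all other dims are 1)
    return static_cast<int64_t>(bias_shape.back()) == output_channels && bias_shape.back() == bias_size;
}

Shape flatten_bias(const Shape& bias_shape) {
    // what the Reshape with a {-1} pattern produces for a fusable bias
    return {std::accumulate(bias_shape.begin(), bias_shape.end(), size_t{1}, std::multiplies<size_t>())};
}

int main() {
    std::cout << bias_is_fusable({1, 1, 5}, 5) << ' '      // 1: broadcastable per-channel bias
              << bias_is_fusable({2, 5}, 5) << '\n';       // 0: batch-dependent bias cannot be fused
    std::cout << flatten_bias({1, 1, 5}).front() << '\n';  // 5
}
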
"openvino/op/multiply.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/subtract.hpp" -#include "openvino/op/transpose.hpp" -#include "openvino/op/variadic_split.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" - -#include "split_fc.hpp" - -#include "itt.hpp" - -ov::intel_cpu::SplitFC::SplitFC(int sub_stream_num) { - MATCHER_SCOPE(SplitFC); - auto fc_m = ov::pass::pattern::wrap_type(); - - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - const auto& fc_node = pattern_map.at(fc_m).get_node_shared_ptr(); - auto& rt_info = fc_node->get_rt_info(); - if (rt_info.count("parallelDomain")) { - return false; - } - - const auto src_item = fc_node->get_input_node_shared_ptr(0); - const auto fc_weight_node = fc_node->get_input_node_shared_ptr(1); - - // split happens on the first dimension. - constexpr size_t split_dim = 0; - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, split_dim); - - // needn't to split fc when the dim is 0. - const auto& wgt_shape = fc_weight_node->get_shape(); - // weight shape size 660000 is a trade-off value, which is summarized and verified by LLMs. - if (wgt_shape[split_dim] <= 1 || ov::shape_size(wgt_shape) < 6600000) { - return false; - } - - // parts will be splited according the sub stream num. - int split_num = sub_stream_num + 1; - - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, average); - parts.back() = len - average * (n - 1); - return parts; - }; - - // TODO: support transpose - if (ov::is_type(fc_weight_node)) { - return false; - } - - // 1. If the model is INT4 format, split the INT4 pattern for the FuseFCAndWeightsDecompression. - // 2. If the model is NOT INT4 format, split the weight. - std::vector> wgt_node_vec(split_num); - if (ov::is_type(fc_weight_node) || ov::is_type(fc_weight_node)) { - // INT4 model should consider two patterns, including with Reshape Node and without Reshape Node. - const auto reshape_node = ov::as_type_ptr(fc_weight_node); - const auto multiply_node = reshape_node ? reshape_node->get_input_node_shared_ptr(0) : fc_weight_node; - if (!ov::is_type(multiply_node)) { - return false; - } - auto multiply_pattern = multiply_node->get_input_node_shared_ptr(1); - if (!ov::is_type(multiply_pattern)) { - return false; - } - auto subtract_node = multiply_node->get_input_node_shared_ptr(0); - if (!ov::is_type(subtract_node)) { - return false; - } - auto convert_node1 = subtract_node->get_input_node_shared_ptr(1); - if (!ov::is_type(convert_node1)) { - return false; - } - auto convert_node1_const = ov::as_type_ptr(convert_node1->get_input_node_shared_ptr(0)); - if (!convert_node1_const) { - return false; - } - auto convert_node0 = subtract_node->get_input_node_shared_ptr(0); - if (!ov::is_type(convert_node0)) { - return false; - } - auto wgt_item = convert_node0->get_input_node_shared_ptr(0); - auto cvt_prec = convert_node0->get_element_type(); - - auto split_dim_range = wgt_item->get_shape()[split_dim]; - const auto& convert_node1_shape = convert_node1->get_shape(); - bool need_to_split_convert = ov::shape_size(convert_node1_shape) > 1 && - split_dim < convert_node1_shape.size() && - convert_node1_shape[split_dim] == split_dim_range; - - // We should use VariadicSplit to split the input for FC. 
- std::vector> split_reshape_pattern_vec(split_num); - auto fc_dim_vec = split_parts(split_dim_range, split_num); - auto split_length = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{static_cast(split_num)}, fc_dim_vec); - - auto split_constants = [&](const std::shared_ptr& constant) { - static const std::set unsupported_by_split_element_types{ov::element::u4, ov::element::i4, ov::element::nf4}; - const auto& constant_precision = constant->get_output_element_type(0); - if (unsupported_by_split_element_types.count(constant_precision) == 0) { - auto split = std::make_shared(constant, split_dim_node, split_length); - return split->outputs(); - } - - auto convert = std::make_shared(constant, ov::element::i8); - auto split = std::make_shared(convert, split_dim_node, split_length); - ov::OutputVector res(split->get_output_size()); - for (size_t i = 0; i < split->get_output_size(); ++i) { - res[i] = std::make_shared(split->output(i), constant_precision); - } - return res; - }; - - auto split_wgts = split_constants(wgt_item); - auto split_muls = split_constants(multiply_pattern); - ov::OutputVector split_cvts; - if (need_to_split_convert) { - split_cvts = split_constants(convert_node1_const); - } - - if (reshape_node) { - auto reshape_pattern = reshape_node->get_input_node_shared_ptr(1); - auto reshape_const = ov::as_type_ptr(reshape_pattern); - if (!reshape_const) { - return false; - } - const auto reshape_vec = reshape_const->cast_vector(); - for (int i = 0; i < split_num; ++i) { - split_reshape_pattern_vec[i] = {fc_dim_vec[i], reshape_vec[1]}; - } - } - - std::vector> zp_const_vec(split_num); - for (int i = 0; i < split_num; ++i) { - zp_const_vec[i] = need_to_split_convert ? split_cvts[i] : convert_node1_const->clone_with_new_inputs({}); - } - - for (int i = 0; i < split_num; ++i) { - auto sub_parent0 = std::make_shared(split_wgts[i], cvt_prec); - auto sub_parent1 = std::make_shared(zp_const_vec[i], cvt_prec); - ov::pass::disable_constant_folding(sub_parent0); - ov::pass::disable_constant_folding(sub_parent1); - auto sub_node = std::make_shared(sub_parent0, sub_parent1); - - auto mul_node = std::make_shared(sub_node, split_muls[i]); - if (reshape_node) { - auto reshape_pattern = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, split_reshape_pattern_vec[i]); - wgt_node_vec[i] = std::make_shared(mul_node, reshape_pattern, reshape_node->get_special_zero()); - } else { - wgt_node_vec[i] = mul_node; - } - } - } else { - // get input - auto wgt_item = fc_node->get_input_node_shared_ptr(1); - - // split weight - auto split_dim_range = wgt_item->get_shape()[split_dim]; - - // We should use VariadicSplit to split input for FC. - auto fc_dim_vec = split_parts(split_dim_range, split_num); - auto split_length = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{static_cast(split_num)}, fc_dim_vec); - auto split_wgts = std::make_shared(wgt_item, - split_dim_node, - split_length); - - wgt_node_vec = split_wgts->outputs(); - } - - // create fc Nodes according to the splited weight or splited pattern. - std::vector> fc_node_vec(split_num); - for (int i = 0; i < split_num; ++i) { - fc_node_vec[i] = fc_node->clone_with_new_inputs(ov::OutputVector{src_item, wgt_node_vec[i]}); - fc_node_vec[i]->get_rt_info()["parallelDomain"] = fc_node->get_name(); - } - - // concat all small fc for result. - ov::NodeVector concat_args(std::move(fc_node_vec)); - // concat happens on the latest dimension. 
- constexpr size_t concat_dim = -1; - auto concat_node = std::make_shared(concat_args, concat_dim); - - // check the shape after transformation. - const auto& out_shape = fc_node->get_output_partial_shape(0); - const auto& concat_shape = concat_node->get_output_partial_shape(0); - if (concat_shape != out_shape) { - return false; - } - ov::replace_node_update_name(fc_node, concat_node); - return true; - }; - - auto m = std::make_shared(fc_m, matcher_name); - this->register_matcher(m, callback); -} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp deleted file mode 100644 index f8434770b278ef..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/pass/graph_rewrite.hpp" - -namespace ov { -namespace intel_cpu { - -/* - * Description: - * SplitFC detects FC CPU operation with and without compressed weights. - * And then splits the FC into several small FCs by output channel according to sub stream number. - * The goal is that the executor can dispatch the split FCs to different numa nodes in the system. - * As a result, the split FCs can be executed at the parallel level. - * - * Before: - * - * +-------+ +-------+ - * | X | | W | - * | | | | - * | | | | - * +-------+ +-------+ - * | | - * | | - * +---------------v---------------------------------v--------------+ - * | | - * | FullyConnected | - * | | - * +------------------------------+---------------------------------+ - * | - * | Output - * v - * - * After: - * - * +-------+ +-------+ - * | X | | W | - * | | | | - * | | | | - * +---+---+ +---+---+ - * | | - * | | - * | +-------v-------+ - * | | | - * | | VariadicSplit | - * | | | - * | +--+---------+--+ - * | | | - * | +------------------------+ | - * | | | - * +---------|------------------------+ | - * | | | | - * +----------v---------v---------+ +-----------v---------v--------+ - * | | | | - * | FullyConnected | | FullyConnected | - * | | | | - * +--------------+---------------+ +--------------+---------------+ - * | | - * | Output | Output - * | | - * +--------------v---------------------------------v---------------+ - * | | - * | Concat | - * | | - * +-------------------------------+--------------------------------+ - * | - * | - * v - */ - -class SplitFC: public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("SplitFC", "0"); - SplitFC(int sub_stream_num); -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 20502f67d3645e..87fa1291bb7141 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -2,36 +2,67 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/core/type/element_type.hpp" #include "openvino/pass/constant_folding.hpp" -#include "openvino/op/fake_quantize.hpp" #include "openvino/pass/manager.hpp" #include "common/pass/align_matmul_input_ranks.hpp" -#include "transformations/common_optimizations/reshape_prelu.hpp" -#include "common/pass/convert_broadcast_to_tiles.hpp" +#include 
"transformations/common_optimizations/nop_elimination.hpp" #include "common/pass/convert_tile_to_seq_tiles.hpp" #include "common/pass/convert_matmul_to_fc.hpp" #include "common/pass/convert_to_power_static.hpp" #include "common/pass/convert_to_leaky_relu.hpp" #include "common/pass/convert_to_swish_cpu.hpp" #include "common/pass/move_fc_reshape_to_weights.hpp" -#include "common/pass/split_fc.hpp" +#include "common/pass/fc_bias_fusion.hpp" #include "transformations/convert_precision.hpp" -#include "transformations/utils/utils.hpp" +#include "transformations/op_conversions/convert_fc_to_compressed.hpp" +#include "transformations/op_conversions/convert_fc_to_quantized_legacy.hpp" #include "common/pass/rnn_sequences_optimization.hpp" #include "transformations/common_optimizations/reshape_sequence_fusion.hpp" #include "transformations/defs.hpp" +#include "config.h" +#include "nodes/fullyconnected.h" #include "itt.hpp" namespace ov { namespace intel_cpu { -inline void ConvertToCPUSpecificOpset(std::shared_ptr &model) { +inline void ConvertToCPUSpecificOpset(std::shared_ptr &model, const Config& config) { RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset); ov::pass::Manager manager("CPU:ConvertToCPUSpecificOpset"); manager.set_per_pass_validation(false); + CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC); + CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion); + + std::vector supported_activation_types { + // @todo enable for bf16 as well + // after EnforceInferencePrecision is replaced with ConvertPrecision + ov::element::f32, + }; + + std::vector supported_compressed_weights_types { + ov::element::u8, + ov::element::i8, + ov::element::u4, + ov::element::i4, + ov::element::nf4, + ov::element::f4e2m1, + }; + + CPU_REGISTER_PASS_X64( + manager, + pass::ConvertFullyConnectedToFullyConnectedCompressed, + supported_activation_types, + supported_compressed_weights_types, + [&config](const std::shared_ptr& fc, size_t IC, size_t OC, size_t G) { + return ov::intel_cpu::node::FullyConnected::isSupportedCompressedOperation( + fc, IC, OC, G, config.inferencePrecision); + }); + + CPU_REGISTER_PASS_X64(manager, pass::ConvertFCToFCQuantizedLegacy); CPU_REGISTER_PASS_X64(manager, MoveFCReshapeToWeights); CPU_REGISTER_PASS_X64(manager, ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 27afb95a73a1e9..f9fa372030e4cc 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -21,6 +21,7 @@ // Common transformations #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp" #include "transformations/common_optimizations/add_fake_quantize_fusion.hpp" +#include "transformations/common_optimizations/reshape_prelu.hpp" #include "transformations/fp16_compression/convert_compression_only_to_legacy.hpp" #include "transformations/common_optimizations/convert_quantize_dequantize.hpp" #include "transformations/common_optimizations/lstm_cell_fusion.hpp" @@ -319,7 +320,7 @@ void Transformations::UpToLpt() { void Transformations::CpuSpecificOpSet(void) { CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Specific); - ConvertToCPUSpecificOpset(model); + ConvertToCPUSpecificOpset(model, config); } void Transformations::PreLpt(const std::vector& defaultPrecisions) { diff --git 
a/src/plugins/intel_cpu/src/transformations/utils.cpp b/src/plugins/intel_cpu/src/transformations/utils.cpp index 3aa74f9ed9a970..63871868713e02 100644 --- a/src/plugins/intel_cpu/src/transformations/utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/utils.cpp @@ -4,7 +4,7 @@ #include "utils.hpp" #include "openvino/opsets/opset1.hpp" -#include "cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "transformations/rt_info/dequantization_node.hpp" #include "transformations/utils/utils.hpp" @@ -21,7 +21,7 @@ bool has_matmul_with_compressed_weights(const std::shared_ptr& }; for (const auto& op : model->get_ops()) { - if (!ov::is_type(op) && !ov::is_type(op)) + if (!ov::is_type(op) && !ov::is_type(op)) continue; if (!op->get_input_element_type(0).is_real()) diff --git a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp index b6bd36205f985d..8ae9aa67edf9a7 100644 --- a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp +++ b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp @@ -9,6 +9,7 @@ #include #include "general_utils.h" +#include "openvino/core/except.hpp" #include "precision_support.h" namespace ov { @@ -156,5 +157,35 @@ inline std::vector makeAlignedBuffer(size_t targetSize, const std::vector } return alignedBuffer; } + +/** +* @brief Reshape a tensor down to a specific rank +* +* Examples: +* - reshapeToRank<2>({1, 2, 3, 4, 5}) == {1*2*3*4, 5} == {24, 5} +* - reshapeToRank<4>({1, 2, 3, 4, 5}) == {1*2, 3, 4, 5} == {2, 3, 4, 5} +*/ +template +std::vector reshapeDownToRank(const std::vector& dims, size_t rank) { + OPENVINO_ASSERT(rank > 0, "Rank greater than zero is expected"); + + if (dims.size() <= rank) { + return dims; + } + + const auto accEnd = dims.begin() + (dims.size() - rank + 1); + const auto acc = std::accumulate(dims.begin(), accEnd, (T)1, std::multiplies()); + + std::vector result{acc}; + result.insert(result.end(), accEnd, dims.end()); + + return result; +} + +template +std::vector reshapeDownToRank(const std::vector& dims) { + return reshapeDownToRank(dims, rank); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp index fcc983d84166c5..195d46c70e1c7c 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp @@ -2,6 +2,7 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/core/type/element_type.hpp" #ifdef CPU_DEBUG_CAPS #include "cpu_memory.h" @@ -310,7 +311,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) { void * data = pmem->getData(); auto shape = pmem->getDesc().getShape().getDims(); - if (shape_size(shape) <= 8) { + if (shape_size(shape) <= 8 && pmem->getDesc().getPrecision() != ov::element::undefined) { auto type = pmem->getDesc().getPrecision(); auto tensor = ov::Tensor(type, shape, data); auto constop = std::make_shared(tensor); @@ -663,7 +664,7 @@ std::ostream& operator<<(std::ostream& os, const IMemory& mem) { } return os; } -// @todo remove + void print_dnnl_memory(const dnnl::memory& memory, const size_t size, const int id, const char* message) { const size_t s = memory.get_desc().get_size() / sizeof(float); std::cout << message << " " << id << " size: " << s << ", values: "; diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.h b/src/plugins/intel_cpu/src/utils/debug_capabilities.h index 
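
A short usage sketch for the reshapeDownToRank helper added to cpu_utils.hpp above (its doc comment illustrates the same behaviour under the shorthand reshapeToRank). The free-standing copy below mirrors the helper so the example compiles outside the plugin: all leading dimensions are collapsed into one so the result has exactly the requested rank.

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

template <typename T>
std::vector<T> reshapeDownToRank(const std::vector<T>& dims, size_t rank) {
    assert(rank > 0 && "Rank greater than zero is expected");
    if (dims.size() <= rank)
        return dims;
    // collapse the leading dims into a single one so the result has exactly `rank` dims
    const auto accEnd = dims.begin() + (dims.size() - rank + 1);
    const auto acc = std::accumulate(dims.begin(), accEnd, static_cast<T>(1), std::multiplies<T>());
    std::vector<T> result{acc};
    result.insert(result.end(), accEnd, dims.end());
    return result;
}

int main() {
    // matches the examples in the helper's doc comment
    assert((reshapeDownToRank<int64_t>({1, 2, 3, 4, 5}, 2) == std::vector<int64_t>{24, 5}));
    assert((reshapeDownToRank<int64_t>({1, 2, 3, 4, 5}, 4) == std::vector<int64_t>{2, 3, 4, 5}));
    // already at or below the requested rank: returned unchanged
    assert((reshapeDownToRank<int64_t>({7, 8}, 3) == std::vector<int64_t>{7, 8}));
    return 0;
}
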
7a1158d259a4a3..2646ba817dca9c 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.h +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.h @@ -3,6 +3,7 @@ // #pragma once +#include "cpu_types.h" #include "openvino/util/env_util.hpp" #ifdef CPU_DEBUG_CAPS @@ -94,6 +95,12 @@ class PrintableTimer { } }; +template +std::ostream & operator<<(std::ostream & os, const std::vector vec) { + for (const auto& element : vec) + os << element << "x"; + return os; +} std::ostream & operator<<(std::ostream & os, const PortConfig& desc); std::ostream & operator<<(std::ostream & os, const NodeConfig& desc); std::ostream & operator<<(std::ostream & os, const NodeDesc& desc); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp index 6d827614f80c54..4afdd90427b06e 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp @@ -23,7 +23,6 @@ static const std::vector& filterSpecificParamsFC() { std::vector fusingParamsSet2D_smoke { emptyFusingSpec, fusingBias, - fusingMultiplyPerChannel, fusingRelu, fusingTanh }; @@ -62,7 +61,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_f16, MatMulLayerCPUTest, testParams2D_smoke std::vector fusingParamsSet3D_smoke { emptyFusingSpec, fusingBias, - fusingMultiplyPerChannel, fusingRelu, fusingTanh }; @@ -106,7 +104,6 @@ const std::vector IS = { std::vector fusingParamsSet4D_smoke { emptyFusingSpec, - fusingMultiplyPerChannel, fusingRelu, fusingTanh }; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp index 3643427de3e9b7..9a434943893eed 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp @@ -87,7 +87,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface() << ":"; + result << configEntry.first << ", " << configEntry.second.as() << "_"; } result << ")"; result << CpuTestWithFusing::getTestCaseName(fusing_params); diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp index a5b01a2c3c2f9c..90a2fc9d0b9768 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp @@ -4,9 +4,11 @@ #include -#include "openvino/op/parameter.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "custom_shape_infer.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "ov_ops/fully_connected.hpp" + namespace ov { namespace intel_cpu { namespace unit_test { @@ -16,16 +18,66 @@ using namespace ov; using namespace ov::intel_cpu; TEST(CpuShapeInfer, FC_InputSize_2) { - auto activate = std::make_shared(element::f32, PartialShape{-1, -1 }); + auto activate = std::make_shared(element::f32, PartialShape{-1, -1}); auto weight = std::make_shared(element::f32, PartialShape{5, 6}); - auto op = std::make_shared(activate, weight, ov::Rank(5), 
element::f32); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); std::vector static_input_shapes = {StaticShape{720, 640}, {5, 6}}; - std::vector static_output_shapes = {StaticShape{1, 1, 1, 720, 5}}; + std::vector static_output_shapes = {StaticShape{720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} + +TEST(CpuShapeInfer, FC_broadcastWeights1) { + auto activate = std::make_shared(element::f32, PartialShape{1, -1, -1}); + auto weight = std::make_shared(element::f32, PartialShape{5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{1, 720, 6}, {5, 6}}; + std::vector static_output_shapes = {StaticShape{1, 720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} + +TEST(CpuShapeInfer, FC_broadcastWeights2) { + auto activate = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + auto weight = std::make_shared(element::f32, PartialShape{5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{2, 3, 720, 6}, {5, 6}}; + std::vector static_output_shapes = {StaticShape{2, 3, 720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} + +TEST(CpuShapeInfer, FC_broadcastActivations1) { + auto activate = std::make_shared(element::f32, PartialShape{720, -1}); + auto weight = std::make_shared(element::f32, PartialShape{1, 5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{720, 6}, {1, 5, 6}}; + std::vector static_output_shapes = {StaticShape{1, 720, 5}}; unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); } -} // namespace cpu_shape_infer -} // namespace unit_test -} // namespace intel_cpu -} // namespace ov +TEST(CpuShapeInfer, FC_broadcastActivations2) { + auto activate = std::make_shared(element::f32, PartialShape{-1, -1}); + auto weight = std::make_shared(element::f32, PartialShape{1, 1, 5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{720, 6}, {1, 1, 5, 6}}; + std::vector static_output_shapes = {StaticShape{1, 1, 720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} +} // namespace cpu_shape_infer +} // namespace unit_test +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp index cb085920d97dc5..37df1fd6d27910 100644 --- a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp @@ -4,21 +4,20 @@ #include -#include #include - #include #include #include #include -#include +#include +#include #include #include #include -#include -#include #include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/fully_connected.hpp" #include "transformations/rt_info/decompression.hpp" using namespace testing; @@ -26,25 +25,28 @@ using namespace ov::intel_cpu; TEST_F(TransformationTestsF, 
ConvertMatMulToFCTest1) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 1, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1}); auto matmul = std::make_shared(input1, input2, true, false); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose1 = std::make_shared(input1, transpose_constant1); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1}); + auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose2 = std::make_shared(input2, transpose_constant2); - auto matmul = std::make_shared(transpose1, transpose2, ov::Rank(3)); + auto matmul = std::make_shared( + transpose1, + transpose2, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } @@ -78,7 +80,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest3) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -96,27 +101,30 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest4) { { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest5) { - auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{ -1, -1, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 3, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2, 2}, {1}); auto matmul = std::make_shared(input1, input2, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } TEST_F(TransformationTestsF, ConvertMatMulToFCTest6) { 
- auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{ -1, -1, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 3, 1, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 1, 2}, {1}); auto matmul = std::make_shared(input1, input2, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } @@ -132,7 +140,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest7) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1}); - auto fc = std::make_shared(input1, input2, ov::Rank(2)); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -151,11 +162,14 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest8) { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1}); - auto fc = std::make_shared(input1, input2, ov::Rank(2)); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); auto a_shape = std::make_shared(input1); auto I = ov::op::util::node_to_get_shape_value_of_indices_from_shape_node(a_shape, {0, 1}); - auto O = ov::opset1::Constant::create(ov::element::i64, { 1 }, { 3 }); + auto O = ov::opset1::Constant::create(ov::element::i64, {1}, {3}); auto output_shape = std::make_shared(ov::OutputVector{I, O}, 0); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); @@ -174,7 +188,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest9) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -182,10 +199,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest9) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest10) { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 2 }, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); auto matmul = std::make_shared(input1, input2, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } @@ -218,8 +235,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest13) { } { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 1}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{80, 1}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 80, 1}, {1}); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, 
ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -242,8 +262,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest14) { } { auto input1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, 1}); - auto input2 = ov::opset1::Constant::create(ov::element::i8, ov::Shape{80, 1}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::i8, ov::Shape{1, 80, 1}, {1}); + + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -252,7 +277,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest14) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_1) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 4, 5}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 6, 5 }, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{6, 5}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -260,8 +285,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_1) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 4, 5}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 6, 5 }, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{6, 5}, {1}); + + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -278,8 +308,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_2) { } { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 1, 5}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{10, 5}, {1}); - auto fc = std::make_shared(input1, input2, ov::Rank(4)); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 10, 5}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -288,7 +321,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_2) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -296,8 +329,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); 
model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -306,7 +343,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -314,8 +351,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -324,7 +365,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_5) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -332,8 +373,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_5) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -350,97 +395,112 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_1) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{5, 2, 3}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(2)); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1}); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_2) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 2, 3 }); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3}); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1}); auto matmul = std::make_shared(input1, weights, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 
}); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 2, 3 }); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 }); - auto matmul = std::make_shared(input1, weights, ov::Rank(2)); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3}); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1}); + auto matmul = std::make_shared( + input1, + weights, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_3) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 5, 2, 3 }); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 1, 2, 3 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{5, 2, 3}); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1}); auto matmul = std::make_shared(input1, weights, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 5, 2, 3 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{5, 2, 3}); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 }); - auto matmul = std::make_shared(input1, weights, ov::Rank(3)); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1}); + auto matmul = std::make_shared( + input1, + weights, + std::make_shared(ov::element::undefined, ov::Shape{0})); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_0) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 1, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); auto convert = std::make_shared(input2, ov::element::f32); ov::mark_as_decompression(convert); auto matmul = std::make_shared(input1, convert, false, false); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); + auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose = std::make_shared(input2, transpose_constant); auto convert = std::make_shared(transpose, ov::element::f32); - 
auto matmul = std::make_shared(input1, convert, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + convert, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_1) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 1, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); auto convert = std::make_shared(input2, ov::element::f32); ov::mark_as_decompression(convert); auto matmul = std::make_shared(input1, convert, true, false); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose1 = std::make_shared(input1, transpose_constant1); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); + auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose2 = std::make_shared(input2, transpose_constant2); auto convert = std::make_shared(transpose2, ov::element::f32); - auto matmul = std::make_shared(transpose1, convert, ov::Rank(3)); + auto matmul = std::make_shared( + transpose1, + convert, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } @@ -467,12 +527,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_compressed_u8_weights) { auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 2}, {1}); auto mul = std::make_shared(sub, mul_const); - auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {2}, {2, -1}); - auto reshape = std::make_shared(mul, reshape_const, false); - auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {2}, {1, 0}); - auto transpose = std::make_shared(reshape, transpose_const); - auto matmul = std::make_shared(data, transpose, ov::Rank(3)); + auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {3}, {0, 2, 1}); + auto transpose = std::make_shared(mul, transpose_const); + auto matmul = std::make_shared( + data, + transpose, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ data }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{data}); } } diff --git a/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp 
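
The updated tests above repeatedly build the three-input internal FullyConnected where "no bias" is modelled as an empty Constant of element::undefined with Shape{0}. A minimal sketch of that construction pattern follows; the template arguments (Parameter, Constant, ov::op::internal::FullyConnected) are inferred from the includes in these hunks, since the diff text dropped them.

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"
#include "ov_ops/fully_connected.hpp"

std::shared_ptr<ov::Model> make_fc_model() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 2});
    auto weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1});
    // empty bias placeholder; FullyConnectedBiasFusion later replaces it when an Add follows the FC
    auto bias = std::make_shared<ov::op::v0::Constant>(ov::element::undefined, ov::Shape{0});
    auto fc = std::make_shared<ov::op::internal::FullyConnected>(data, weights, bias);
    return std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{data});
}
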
b/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp index 68241c9169bce7..b3d733aecba27b 100644 --- a/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp +++ b/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include "ov_ops/fully_connected.hpp" #include #include @@ -115,7 +115,12 @@ class MoveFCReshapeToWeightsTests : public TransformationTestsF, public WithPara auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {2}, {1, 0}); weights_path = std::make_shared(weights_path, transpose_const); } - auto fully_connected = std::make_shared(data, weights_path, ov::Rank(3)); + + auto fully_connected = std::make_shared( + data, + weights_path, + std::make_shared(ov::element::undefined, ov::Shape{0})); + return std::make_shared(ov::NodeVector{fully_connected}, ov::ParameterVector{data}); } diff --git a/src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp b/src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp deleted file mode 100644 index 4c955ec5286813..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include "openvino/core/visibility.hpp" -#include -#include - -#include "common_test_utils/ov_test_utils.hpp" -#include "transformations/rt_info/decompression.hpp" - -using namespace testing; -using namespace ov::intel_cpu; - -#if defined (OPENVINO_ARCH_ARM) && defined(__linux__) -// Ticket: 153166 -TEST_F(TransformationTestsF, DISABLED_SplitFCTest) { -#else -TEST_F(TransformationTestsF, SplitFCTest) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 4096, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2048, 4096 }, { 12.34 }); - - auto fc = std::make_shared(transpose_src, wgt, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 4096, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2048, 4096 }, { 12.34 }); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {1024, 1024}); - auto split_wgts = std::make_shared(wgt, split_dim_node, split_length); - - auto fc0 = std::make_shared(transpose_src, split_wgts->output(0), ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, split_wgts->output(1), ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} - -#if defined (OPENVINO_ARCH_ARM) && defined(__linux__) -// Ticket: 153166 -TEST_F(TransformationTestsF, 
DISABLED_SplitFCTest_int8_weight) { -#else -TEST_F(TransformationTestsF, SplitFCTest_int8_weight) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{3, 4096, 1}); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u8, ov::Shape{2048, 4096}, {123}); - auto cvt_wgt = std::make_shared(wgt, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u8, ov::Shape{2048, 1}, {1}); - auto cvt_zp = std::make_shared(zp, ov::element::f32); - - auto sub = std::make_shared(cvt_wgt, cvt_zp); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto mul = std::make_shared(sub, mul_const); - - auto fc = std::make_shared(transpose_src, mul, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 4096, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u8, ov::Shape{ 2048, 4096 }, { 123 }); - auto cvt_wgt = std::make_shared(wgt, ov::element::f32); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {1024, 1024}); - - auto split_wgts = std::make_shared(wgt, split_dim_node, split_length); - auto cvt_wgt0 = std::make_shared(split_wgts->output(0), ov::element::f32); - auto cvt_wgt1 = std::make_shared(split_wgts->output(1), ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u8, ov::Shape{2048, 1}, {1}); - auto split_zp = std::make_shared(zp, split_dim_node, split_length); - - auto cvt_zp0 = std::make_shared(split_zp->output(0), ov::element::f32); - auto cvt_zp1 = std::make_shared(split_zp->output(1), ov::element::f32); - - auto sub0 = std::make_shared(cvt_wgt0, cvt_zp0); - auto sub1 = std::make_shared(cvt_wgt1, cvt_zp1); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto split_mul_const = std::make_shared(mul_const, split_dim_node, split_length); - - auto mul0 = std::make_shared(sub0, split_mul_const->output(0)); - auto mul1 = std::make_shared(sub1, split_mul_const->output(1)); - - auto fc0 = std::make_shared(transpose_src, mul0, ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, mul1, ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} - -#if defined (OPENVINO_ARCH_ARM) && defined(__linux__) -// Ticket: 153166 -TEST_F(TransformationTestsF, DISABLED_SplitFCTest_int4_weight) { -#else -TEST_F(TransformationTestsF, SplitFCTest_int4_weight) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{3, 4096, 1}); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 4096}, {12}); - auto cvt_wgt = 
std::make_shared(wgt, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 1}, {1}); - auto cvt_zp = std::make_shared(zp, ov::element::f32); - - auto sub = std::make_shared(cvt_wgt, cvt_zp); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto mul = std::make_shared(sub, mul_const); - - auto fc = std::make_shared(transpose_src, mul, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{3, 4096, 1}); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 4096}, {12}); - auto cvt_wgt_i8 = std::make_shared(wgt, ov::element::i8); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {1024, 1024}); - - auto split_wgts = std::make_shared(cvt_wgt_i8, split_dim_node, split_length); - auto cvt_wgt0_u4 = std::make_shared(split_wgts->output(0), ov::element::u4); - auto cvt_wgt1_u4 = std::make_shared(split_wgts->output(1), ov::element::u4); - auto cvt_wgt0_f32 = std::make_shared(cvt_wgt0_u4, ov::element::f32); - auto cvt_wgt1_f32 = std::make_shared(cvt_wgt1_u4, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 1}, {1}); - auto cvt_zp_i8 = std::make_shared(zp, ov::element::i8); - auto split_zp = std::make_shared(cvt_zp_i8, split_dim_node, split_length); - - auto cvt_zp0_u4 = std::make_shared(split_zp->output(0), ov::element::u4); - auto cvt_zp1_u4 = std::make_shared(split_zp->output(1), ov::element::u4); - auto cvt_zp0_f32 = std::make_shared(cvt_zp0_u4, ov::element::f32); - auto cvt_zp1_f32 = std::make_shared(cvt_zp1_u4, ov::element::f32); - - auto sub0 = std::make_shared(cvt_wgt0_f32, cvt_zp0_f32); - auto sub1 = std::make_shared(cvt_wgt1_f32, cvt_zp1_f32); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto split_mul_const = std::make_shared(mul_const, split_dim_node, split_length); - - auto mul0 = std::make_shared(sub0, split_mul_const->output(0)); - auto mul1 = std::make_shared(sub1, split_mul_const->output(1)); - - auto fc0 = std::make_shared(transpose_src, mul0, ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, mul1, ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} - -#if (defined OPENVINO_ARCH_ARM && defined(__linux__)) -// Ticket: 153166 -TEST_F(TransformationTestsF, DISABLED_SplitFCTest_int4_weight_reshape) { -#else -TEST_F(TransformationTestsF, SplitFCTest_int4_weight_reshape) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 2048, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{ 4096, 2, 1024}, { 12 }); - auto cvt_wgt = std::make_shared(wgt, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, 
ov::Shape{1}, { 1 }); - auto cvt_zp = std::make_shared(zp, ov::element::f32); - - auto sub = std::make_shared(cvt_wgt, cvt_zp); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{4096, 2, 1}, {0.2}); - auto mul = std::make_shared(sub, mul_const); - - auto res_const = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {4096, 2048}); - auto reshape = std::make_shared(mul, res_const, false); - - auto fc = std::make_shared(transpose_src, reshape, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 2048, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{ 4096, 2, 1024 }, { 12 }); - auto cvt_wgt_i8 = std::make_shared(wgt, ov::element::i8); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {2048, 2048}); - - auto split_wgts = std::make_shared(cvt_wgt_i8, split_dim_node, split_length); - auto cvt_wgt0_u4 = std::make_shared(split_wgts->output(0), ov::element::u4); - auto cvt_wgt1_u4 = std::make_shared(split_wgts->output(1), ov::element::u4); - auto cvt_wgt0_f32 = std::make_shared(cvt_wgt0_u4, ov::element::f32); - auto cvt_wgt1_f32 = std::make_shared(cvt_wgt1_u4, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, ov::Shape{1}, { 1 }); - auto zp0 = std::make_shared(zp->get_element_type(), zp->get_shape(), zp->get_data_ptr()); - auto zp1 = std::make_shared(zp->get_element_type(), zp->get_shape(), zp->get_data_ptr()); - - auto cvt_zp0 = std::make_shared(zp0, ov::element::f32); - auto cvt_zp1 = std::make_shared(zp1, ov::element::f32); - - auto sub0 = std::make_shared(cvt_wgt0_f32, cvt_zp0); - auto sub1 = std::make_shared(cvt_wgt1_f32, cvt_zp1); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{4096, 2, 1}, {0.2}); - auto split_mul_const = std::make_shared(mul_const, split_dim_node, split_length); - - auto mul0 = std::make_shared(sub0, split_mul_const->output(0)); - auto mul1 = std::make_shared(sub1, split_mul_const->output(1)); - - std::vector reshape_pattern_vec = {2048, 2048}; - auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, reshape_pattern_vec); - auto reshape0 = std::make_shared(mul0, reshape_pattern, false); - auto reshape1 = std::make_shared(mul1, reshape_pattern, false); - - auto fc0 = std::make_shared(transpose_src, reshape0, ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, reshape1, ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} diff --git a/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp index 8ca920d421040f..d781d92b57052a 100644 --- a/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp +++ b/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp @@ -88,6 +88,7 @@ void TransformationTestsF::TearDown() { ASSERT_TRUE(res.valid) << res.message; comparator.disable(FunctionsComparator::CmpValues::ACCURACY); } + auto res = comparator.compare(model, 
model_ref); ASSERT_TRUE(res.valid) << res.message; } From 0762993323c509eeffd2cae48492607dac936903 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Tue, 10 Dec 2024 08:41:52 +0100 Subject: [PATCH 22/23] Docs Port for sitemap update to master (#27977) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../openvino_custom_sphinx_sitemap/__init__.py | 2 +- docs/sphinx_setup/_static/js/custom.js | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py b/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py index 6bdd3288f8069c..c578b82c360a53 100644 --- a/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py +++ b/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py @@ -155,6 +155,6 @@ def extract_hierarchy(link): return ';'.join(hierarchy) def format_segment(segment): - if segment == 'c_cpp_api': segment = 'c_c++_api' + if segment == 'c_cpp_api': segment = 'C/C++_api' return ' '.join(word.capitalize() for word in segment.replace('-', ' ').replace('_', ' ').split()) \ No newline at end of file diff --git a/docs/sphinx_setup/_static/js/custom.js b/docs/sphinx_setup/_static/js/custom.js index 241f8895ee1c61..95f9549959e102 100644 --- a/docs/sphinx_setup/_static/js/custom.js +++ b/docs/sphinx_setup/_static/js/custom.js @@ -189,7 +189,7 @@ function getCurrentVersion() { if (wordAfterDomain === 'cn') { wordAfterDomain = link[2]; } - if (["index.html", "404.html", "", "latest"].indexOf(wordAfterDomain) >= 0) { + if (["index.html", "404.html", ""].indexOf(wordAfterDomain) >= 0) { /* * If this landing page, 404 or domain.com we should get first version * */ @@ -426,7 +426,7 @@ document.addEventListener('DOMContentLoaded', function () { const searchInterfaceSa = document.querySelector("#sa-search"); const searchInterface = document.querySelector("#search"); const currentVersion = getCurrentVersion(); - + await initializeSearchInterface(searchInterfaceSa, currentVersion); await initializeSearchInterface(searchInterface); From be0ab30ac93be815a34ee20a92348b3220bbf5e1 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 10 Dec 2024 11:56:06 +0400 Subject: [PATCH 23/23] [JAX FE] Support square operation (#27978) **Details:** It appears since JAX 0.4.36 **Ticket:** 158994 Signed-off-by: Kazantsev, Roman --- src/frontends/jax/src/op/square.cpp | 28 ++++++++++++++ src/frontends/jax/src/op_table.cpp | 2 + tests/constraints.txt | 6 +-- tests/layer_tests/jax_tests/test_square.py | 44 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 src/frontends/jax/src/op/square.cpp create mode 100644 tests/layer_tests/jax_tests/test_square.py diff --git a/src/frontends/jax/src/op/square.cpp b/src/frontends/jax/src/op/square.cpp new file mode 100644 index 00000000000000..268debb7992ba8 --- /dev/null +++ b/src/frontends/jax/src/op/square.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/jax/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/squeeze.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace jax { +namespace op { + +using namespace ov::op; + +OutputVector translate_square(const NodeContext& context) { + num_inputs_check(context, 1, 1); + auto x = context.get_input(0); + auto const_two = 
create_same_type_const_scalar(x, 2); + return {std::make_shared(x, const_two)}; +}; + +} // namespace op +} // namespace jax +} // namespace frontend +} // namespace ov diff --git a/src/frontends/jax/src/op_table.cpp b/src/frontends/jax/src/op_table.cpp index 98f22452c5afab..3ca58745bc1909 100644 --- a/src/frontends/jax/src/op_table.cpp +++ b/src/frontends/jax/src/op_table.cpp @@ -53,6 +53,7 @@ OP_CONVERTER(translate_reduce_window_sum); OP_CONVERTER(translate_reshape); OP_CONVERTER(translate_rsqrt); OP_CONVERTER(translate_slice); +OP_CONVERTER(translate_square); OP_CONVERTER(translate_squeeze); OP_CONVERTER(translate_transpose); @@ -92,6 +93,7 @@ const std::map get_supported_ops_jaxpr() { {"rsqrt", op::translate_rsqrt}, {"reshape", op::translate_reshape}, {"slice", op::translate_slice}, + {"square", op::translate_square}, {"sqrt", op::translate_1to1_match_1_input}, {"squeeze", op::translate_squeeze}, {"stop_gradient", op::skip_node}, diff --git a/tests/constraints.txt b/tests/constraints.txt index 004a2c65b5e474..4f46cd0cc8b2e9 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -21,11 +21,11 @@ pytest>=5.0,<8.4 pytest-dependency==0.5.1 pytest-html==4.1.1 pytest-timeout==2.3.1 -jax<=0.4.35 -jaxlib<=0.4.35 +jax<=0.4.36 +jaxlib<=0.4.36 kornia==0.7.0 networkx<=3.3 -flax<=0.10.0 +flax<=0.10.2 --extra-index-url https://download.pytorch.org/whl/cpu torch~=2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" diff --git a/tests/layer_tests/jax_tests/test_square.py b/tests/layer_tests/jax_tests/test_square.py new file mode 100644 index 00000000000000..32e842d182e90e --- /dev/null +++ b/tests/layer_tests/jax_tests/test_square.py @@ -0,0 +1,44 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import jax +import numpy as np +import pytest +from jax import numpy as jnp + +from jax_layer_test_class import JaxLayerTest + +rng = np.random.default_rng(34455) + + +class TestSquare(JaxLayerTest): + def _prepare_input(self): + if np.issubdtype(self.input_type, np.floating): + x = rng.uniform(-8.0, 8.0, self.input_shape).astype(self.input_type) + elif np.issubdtype(self.input_type, np.signedinteger): + x = rng.integers(-8, 8, self.input_shape).astype(self.input_type) + else: + x = rng.integers(0, 8, self.input_shape).astype(self.input_type) + x = jnp.array(x) + return [x] + + def create_model(self, input_shape, input_type): + self.input_shape = input_shape + self.input_type = input_type + + def jax_square(x): + return jax.numpy.square(x) + + return jax_square, None, None + + @pytest.mark.parametrize("input_shape", [[2], [3, 4]]) + @pytest.mark.parametrize("input_type", [np.int8, np.uint8, np.int16, np.uint16, + np.int32, np.uint32, np.int64, np.uint64, + np.float16, np.float32, np.float64]) + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.precommit_jax_fe + def test_square(self, ie_device, precision, ir_version, input_shape, input_type): + self._test(*self.create_model(input_shape, input_type), + ie_device, precision, + ir_version)
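
For context on the square converter added in the last patch: translate_square lowers jax.numpy.square to an OpenVINO Power node with a constant exponent of 2, and op_table.cpp registers it under the "square" primitive name. A minimal sketch of why a dedicated converter became necessary, assuming jax/jaxlib >= 0.4.36 are installed (per the commit message, the release where the standalone square primitive starts to appear in traced programs); the expected jaxpr contents are an assumption, not verified output:

    import jax
    import numpy as np
    from jax import numpy as jnp

    x = np.arange(6, dtype=np.float32).reshape(2, 3)
    # On older JAX releases jnp.square is decomposed into other primitives;
    # from 0.4.36 the trace is expected to contain a `square` primitive,
    # which is the entry the new converter registers in get_supported_ops_jaxpr().
    print(jax.make_jaxpr(jnp.square)(x))

The new test_square.py above exercises the same path end to end across the integer and floating-point input types listed in its parametrization.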